Merge pull request #70 from CNugteren/development

Update to version 0.8.0
author: Cedric Nugteren <web@cedricnugteren.nl> 2016-06-28 22:32:25 +0200
committer: GitHub <noreply@github.com> 2016-06-28 22:32:25 +0200
commit: 7c13bacf129291e3e295ecb6e833788477085fa0 (patch)
tree: d114eeca418444d0b1c70cc9cce983de041235c9
parent: 181eb20bbf15cf11baaf6112b6965050c49dd543 (diff)
parent: 577f0ee1179014ece853af39d6f0ff0c87316eb3 (diff)
348 files changed, 8678 insertions, 4864 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
new file mode 100644
index 00000000..8597e43e
--- /dev/null
+++ b/.appveyor.yml
@@ -0,0 +1,64 @@
+environment:
+  global:
+    CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\clblast"
+    OPENCL_REGISTRY: "https://www.khronos.org/registry/cl"
+    OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl"
+
+platform:
+  - x64
+
+configuration:
+  - Release
+
+init:
+  - cmake --version
+  - C:\"Program Files (x86)"\"Microsoft Visual Studio 14.0"\VC\vcvarsall.bat %PLATFORM%
+
+# Creates an OpenCL library to link against. Taken from clMathLibraries/clBLAS
+install:
+  - ps: mkdir $env:OPENCL_ROOT
+  - ps: pushd $env:OPENCL_ROOT
+  - ps: $opencl_registry = $env:OPENCL_REGISTRY
+  # This downloads the source to the Khronos ICD library
+  - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git
+  - ps: mv ./OpenCL-ICD-Loader/* .
+  # This downloads all the opencl header files
+  # The cmake build files expect a directory called inc
+  - ps: mkdir inc/CL
+  - git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL
+  - ps: wget $opencl_registry/api/2.1/cl.hpp -OutFile inc/CL/cl.hpp
+  # - ps: dir; if( $lastexitcode -eq 0 ){ dir include/CL } else { Write-Output boom }
+  # Create the static import lib in a directory called lib, so findopencl() will find it
+  - ps: mkdir lib
+  - ps: pushd lib
+  - cmake -G "NMake Makefiles" ..
+  - nmake
+  - ps: popd
+  # Switch to OpenCL 1.2 headers
+  - ps: pushd inc/CL
+  - git fetch origin opencl12:opencl12
+  - git checkout opencl12
+  - ps: popd
+  # Rename the inc directory to include, so FindOpencl() will find it
+  - ps: ren inc include
+  - ps: popd
+
+before_build:
+  - ps: mkdir $env:CLBLAST_ROOT
+  - ps: pushd $env:CLBLAST_ROOT
+  - ps: mkdir install_dir
+  - cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=install_dir -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DTESTS=ON -DCLIENTS=ON -DSAMPLES=ON %APPVEYOR_BUILD_FOLDER%
+
+build_script:
+  - nmake
+  - nmake install
+
+after_build:
+  - ps: pushd $env:CLBLAST_ROOT
+  - 7z a CLBlast-Windows-x64.zip .\install_dir\*
+  - ps: mv CLBlast-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
+
+artifacts:
+  - path: '*.zip'
+    name: release
+    type: zip
diff --git a/.travis.yml b/.travis.yml
index 351836de..8e1a80db 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,6 +2,10 @@ language: cpp
 sudo: required
 dist: trusty
 
+os:
+  - linux
+  - osx
+
 compiler:
   - gcc
   - clang
@@ -16,7 +20,7 @@ addons:
 
 env:
   global:
-    - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release
+    - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
     - OPENCL_REGISTRY=https://www.khronos.org/registry/cl
     - OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl
 
@@ -55,7 +59,7 @@ install:
 before_script:
   - mkdir -p ${CLBLAST_ROOT}
   - pushd ${CLBLAST_ROOT}
-  - cmake -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR}
+  - cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR}
 
 script:
   - make
diff --git a/CHANGELOG b/CHANGELOG
index 76903180..b49424c9 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,21 @@
 
+Version 0.8.0
+- Added support for half-precision floating-point (fp16) in the library
+- Made it possible to compile the performance tests (clients) separately from the correctness tests
+- Made a reference BLAS and head-to-head performance comparison optional in the clients
+- Increased the verbosity of the "-verbose" option in the correctness tests
+- Refactored the host code for better compilation times and fewer lines of code
+- Added Appveyor continuous integration and increased coverage of the Travis builds
+- Improved the API documentation
+- Various minor fixes and enhancements
+- Added tuned parameters for various devices (see README)
+- Added half-precision routines:
+  * Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN
+  * Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2
+  * Level-3: HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM
+- Added non-BLAS routines:
+  * SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY (matrix copy, scaling, and/or transpose)
+
 Version 0.7.1
 - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
 - Fixed a bug in the xGEMM routine related to the event incorrectly set
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 02ffba1d..6deee35d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,21 +18,19 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
 # CMake project details
 project("clblast" C CXX)
 set(clblast_VERSION_MAJOR 0)
-set(clblast_VERSION_MINOR 7)
-set(clblast_VERSION_PATCH 1)
+set(clblast_VERSION_MINOR 8)
+set(clblast_VERSION_PATCH 0)
 
 # Options and their default values
 option(SAMPLES "Enable compilation of the examples" OFF)
 option(TUNERS "Enable compilation of the tuners" OFF)
-option(TESTS "Enable compilation of the performance and correctness tests" OFF)
+option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF)
+option(TESTS "Enable compilation of the correctness tests" OFF)
 
 # ==================================================================================================
 
 # RPATH settings
-set(CMAKE_SKIP_BUILD_RPATH false) # Use, i.e. don't skip the full RPATH for the build tree
-set(CMAKE_BUILD_WITH_INSTALL_RPATH false) # When building, don't use the install RPATH already
-set(CMAKE_INSTALL_RPATH "") # The RPATH to be used when installing
-set(CMAKE_INSTALL_RPATH_USE_LINK_PATH false) # Don't add the automatically determined parts
+set(CMAKE_MACOSX_RPATH 1)
 
 # ==================================================================================================
 
@@ -106,46 +104,62 @@ endif()
 
 # Locates the reference BLAS libraries in case the tests need to be compiled. The "FindclBLAS.cmake"
 # and "FindCBLAS.cmake" are included.
-if(TESTS)
+if(CLIENTS OR TESTS)
   find_package(clBLAS)
   find_package(CBLAS)
   if(NOT CLBLAS_FOUND AND NOT CBLAS_FOUND)
-    message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
-    set(TESTS OFF)
+    if(TESTS)
+      message(STATUS "Could NOT find clBLAS nor a CPU BLAS, disabling the compilation of the tests")
+      set(TESTS OFF)
+    endif()
+    if(CLIENTS)
+      message(STATUS "Could NOT find clBLAS nor a CPU BLAS, head-to-head performance comparison not supported in the clients")
+    endif()
   endif()
 endif()
 
 # ==================================================================================================
 
 # Includes directories: CLBlast and OpenCL
-include_directories(${clblast_SOURCE_DIR}/include ${OPENCL_INCLUDE_DIRS})
+include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS})
 
 # ==================================================================================================
 
 # Sets the supported routines and the used kernels. New routines and kernels should be added here.
-set(KERNELS copy pad transpose padtranspose xaxpy xdot xger xgemm xgemv)
+set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv)
 set(SAMPLE_PROGRAMS_CPP sgemm)
-set(SAMPLE_PROGRAMS_C sasum dgemv sgemm cache)
+set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
 set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
-set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
-set(PRECISIONS 32 64 3232 6464)
+set(LEVELX_ROUTINES xomatcopy)
+set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
+set(PRECISIONS 32 64 3232 6464 16)
 
 # ==================================================================================================
 
 # Gathers all source-files
-set(SOURCES src/clblast.cc src/database.cc src/routine.cc src/cache.cc
-            src/utilities.cc src/clblast_c.cc)
+set(SOURCES
+  src/database/database.cpp
+  src/routines/common.cpp
+  src/cache.cpp
+  src/clblast.cpp
+  src/clblast_c.cpp
+  src/routine.cpp
+  src/utilities.cpp
+)
 foreach(ROUTINE ${LEVEL1_ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cc)
+  set(SOURCES ${SOURCES} src/routines/level1/${ROUTINE}.cpp)
 endforeach()
 foreach(ROUTINE ${LEVEL2_ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cc)
+  set(SOURCES ${SOURCES} src/routines/level2/${ROUTINE}.cpp)
 endforeach()
 foreach(ROUTINE ${LEVEL3_ROUTINES})
-  set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
+  set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cpp)
+endforeach()
+foreach(ROUTINE ${LEVELX_ROUTINES})
+  set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cpp)
 endforeach()
 
 # Creates and links the library
@@ -156,6 +170,7 @@ target_link_libraries(clblast ${OPENCL_LIBRARIES})
 install(TARGETS clblast DESTINATION lib)
 install(FILES include/clblast.h DESTINATION include)
 install(FILES include/clblast_c.h DESTINATION include)
+install(FILES include/clblast_half.h DESTINATION include)
 
 # ==================================================================================================
 
@@ -178,7 +193,7 @@ if(SAMPLES)
 
   # Adds sample programs (C++)
   foreach(SAMPLE ${SAMPLE_PROGRAMS_CPP})
-    add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cc)
+    add_executable(clblast_sample_${SAMPLE} samples/${SAMPLE}.cpp)
     target_link_libraries(clblast_sample_${SAMPLE} clblast ${OPENCL_LIBRARIES})
     install(TARGETS clblast_sample_${SAMPLE} DESTINATION bin)
   endforeach()
@@ -203,7 +218,7 @@ if(TUNERS)
 
   # Adds tuning executables
   foreach(KERNEL ${KERNELS})
-    add_executable(clblast_tuner_${KERNEL} src/tuning/${KERNEL}.cc)
+    add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cpp)
     target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES})
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
@@ -223,9 +238,8 @@ endif()
 
 # ==================================================================================================
 
-# Down from here is all test (performance and correctness) related. Note that these tests require
-# the presence of clBLAS and/or a BLAS library to act as a reference.
-if(TESTS)
+# Section for the tests: common part for both performance ('CLIENTS') and correctness ('TESTS')
+if(CLIENTS OR TESTS)
 
   # Sets the specifics for the reference BLAS libraries
   set(REF_INCLUDES )
@@ -250,28 +264,90 @@ if(TESTS)
   endif()
 
   # Sets the include directories
-  include_directories(${clblast_SOURCE_DIR}/test ${REF_INCLUDES})
+  include_directories(${clblast_SOURCE_DIR} ${REF_INCLUDES})
+
+endif()
+
+# ==================================================================================================
 
-  # Creates the common correctness-tests objects (requires CMake 2.8.8)
-  add_library(test_correctness_common OBJECT
-              test/correctness/tester.cc test/correctness/testblas.cc)
+# Section for the performance tests (i.e. the client). These optionally compare against a reference
+# library, either clBLAS or a CPU BLAS.
+if(CLIENTS)
+
+  # Visual Studio requires the sources of non-exported objects/libraries
+  set(CLIENTS_COMMON )
+  if(MSVC)
+    set(CLIENTS_COMMON ${CLIENTS_COMMON} src/utilities.cpp test/performance/client.cpp)
+  else()
+    # Creates the common performance-tests objects (requires CMake 2.8.8)
+    add_library(test_performance_common OBJECT test/performance/client.cpp)
+    set(CLIENTS_COMMON ${CLIENTS_COMMON} $<TARGET_OBJECTS:test_performance_common>)
+  endif()
+
+  # Compiles the performance-tests
+  foreach(ROUTINE ${LEVEL1_ROUTINES})
+    add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
+                   test/performance/routines/level1/${ROUTINE}.cpp)
+  endforeach()
+  foreach(ROUTINE ${LEVEL2_ROUTINES})
+    add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
+                   test/performance/routines/level2/${ROUTINE}.cpp)
+  endforeach()
+  foreach(ROUTINE ${LEVEL3_ROUTINES})
+    add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
+                   test/performance/routines/level3/${ROUTINE}.cpp)
+  endforeach()
+  foreach(ROUTINE ${LEVELX_ROUTINES})
+    add_executable(clblast_client_${ROUTINE} ${CLIENTS_COMMON}
+                   test/performance/routines/levelx/${ROUTINE}.cpp)
+  endforeach()
+  foreach(ROUTINE ${ROUTINES})
+    target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
+    install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
+  endforeach()
+
+endif()
+
+# ==================================================================================================
+
+# Section for the correctness tests. Note that these tests require the presence of clBLAS and/or a
+# CPU BLAS library to act as a reference.
+if(TESTS)
+  enable_testing()
+
+  # Visual Studio requires the sources of non-exported objects/libraries
+  set(TESTS_COMMON )
+  if(MSVC)
+    set(TESTS_COMMON ${TESTS_COMMON} src/utilities.cpp
+        test/correctness/tester.cpp test/correctness/testblas.cpp)
+  else()
+    # Creates the common correctness-tests objects (requires CMake 2.8.8)
+    add_library(test_correctness_common OBJECT
+                test/correctness/tester.cpp test/correctness/testblas.cpp)
+    set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>)
+  endif()
 
   # Compiles the correctness-tests
   foreach(ROUTINE ${LEVEL1_ROUTINES})
-    add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   test/correctness/routines/level1/${ROUTINE}.cc)
+    add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
+                   test/correctness/routines/level1/${ROUTINE}.cpp)
   endforeach()
   foreach(ROUTINE ${LEVEL2_ROUTINES})
-    add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   test/correctness/routines/level2/${ROUTINE}.cc)
+    add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
+                   test/correctness/routines/level2/${ROUTINE}.cpp)
   endforeach()
   foreach(ROUTINE ${LEVEL3_ROUTINES})
-    add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
-                   test/correctness/routines/level3/${ROUTINE}.cc)
+    add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
+                   test/correctness/routines/level3/${ROUTINE}.cpp)
+  endforeach()
+  foreach(ROUTINE ${LEVELX_ROUTINES})
+    add_executable(clblast_test_${ROUTINE} ${TESTS_COMMON}
+                   test/correctness/routines/levelx/${ROUTINE}.cpp)
   endforeach()
   foreach(ROUTINE ${ROUTINES})
     target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
     install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
+    add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM})
   endforeach()
 
   # Adds 'alltests' target: runs all tests
@@ -283,26 +359,6 @@ if(TESTS)
   endforeach()
   add_custom_target(alltests ${ALLTESTS} DEPENDS ${ALLTESTSDEPENDS})
 
-  # Creates the common performance-tests objects (requires CMake 2.8.8)
-  add_library(test_performance_common OBJECT test/performance/client.cc)
-
-  # Compiles the performance-tests
-  foreach(ROUTINE ${LEVEL1_ROUTINES})
-    add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/level1/${ROUTINE}.cc)
-  endforeach()
-  foreach(ROUTINE ${LEVEL2_ROUTINES})
-    add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/level2/${ROUTINE}.cc)
-  endforeach()
-  foreach(ROUTINE ${LEVEL3_ROUTINES})
-    add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
-                   test/performance/routines/level3/${ROUTINE}.cc)
-  endforeach()
-  foreach(ROUTINE ${ROUTINES})
-    target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
-    install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
-  endforeach()
-
 endif()
+
 # ==================================================================================================
diff --git a/README.md b/README.md
index e4564c26..ddd841e2 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,14 @@
 CLBlast: The tuned OpenCL BLAS library
 ================
 
-[![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast)
+| | master | development |
+|-----|-----|-----|
+| Linux/OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=development)](https://travis-ci.org/CNugteren/CLBlast/branches) |
+| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=development&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) |
 
 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices.
 
-__Note that the CLBlast library is actively being developed, and might not be mature enough for production environments__. This preview-version doesn't support the less commonly used routines yet: they will be added in due time. It also lacks extensive tuning on some common OpenCL platforms: __out-of-the-box performance on some devices might be poor__. See below for more details (and how to tune yourself).
+This preview-version is not yet tuned for all OpenCL devices: __out-of-the-box performance on some devices might be poor__. See below for a list of already tuned devices and instructions on how to tune yourself and contribute to future releases of the CLBlast library.
 
 
 Why CLBlast and not clBLAS or cuBLAS?
@@ -16,21 +19,22 @@ Use CLBlast instead of clBLAS:
 
 * When you care about achieving maximum performance.
 * When you want to be able to inspect the BLAS kernels or easily customize them to your needs.
-* When you run on exotic OpenCL devices which you need to tune yourself.
+* When you run on exotic OpenCL devices for which you need to tune yourself.
 * When you are still running on OpenCL 1.1 hardware.
 * When you value an organized and modern C++ codebase.
 * When you target Intel CPUs and GPUs or embedded devices
+* When you can benefit from the increased performance of half-precision fp16 data-types.
 
 Use CLBlast instead of cuBLAS:
 
 * When you want your code to run on devices other than NVIDIA CUDA-enabled GPUs.
-* When you want to tune for a specific configuration (e.g. rectangular matrix-sizes)
+* When you want to tune for a specific configuration (e.g. rectangular matrix-sizes).
 * When you sleep better if you know that the library you use is open-source.
+* When you are using OpenCL rather than CUDA.
 
 When not to use CLBlast:
 
 * When you run on NVIDIA's CUDA-enabled GPUs only and can benefit from cuBLAS's assembly-level tuned kernels.
-* When you need those BLAS routines that are not yet supported by CLBlast.
 
 
 Compilation and installation
@@ -52,14 +56,6 @@ The pre-requisites for compilation of CLBlast are:
   - Intel OpenCL
   - Beignet
 
-Furthermore, to build the (optional) correctness and performance tests, another BLAS library is needed to serve as a reference. This can be either:
-
-* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
-* A regular CPU Netlib BLAS library, e.g.:
-  - OpenBLAS
-  - BLIS
-  - Accelerate
-
 An example of an out-of-source build using a command-line compiler and make (starting from the root of the CLBlast folder):
 
     mkdir build
@@ -90,7 +86,9 @@ Or alternatively the plain C version:
 
     #include <clblast_c.h>
 
-Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the `clblast.h` include file and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in `samples/`.
+Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows:
+
+    cmake -DSAMPLES=ON ..
 
 
 Using the tuners (optional)
@@ -99,6 +97,7 @@ Using the tuners (optional)
 The CLBlast library will be tuned in the future for the most commonly used OpenCL devices. This pre-release of CLBlast is only tuned for a limited number of devices, in particular those with the following `CL_DEVICE_NAME` values:
 
 * NVIDIA GPUs:
+  - GRID K520
   - GeForce GTX 480
   - GeForce GTX 680
   - GeForce GTX 750 Ti
@@ -111,8 +110,10 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
   - Tahiti
   - Hawaii
   - Pitcairn
-  - R9 M370X
+  - Radeon R9 M370X Compute Engine
 * Intel GPUs:
+  - HD Graphics Haswell Ultrabook GT2 Mobile
+  - HD Graphics Skylake ULT GT2
   - Iris
   - Iris Pro
 * Intel CPUs:
@@ -123,15 +124,15 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC
   - ARM Mali-T628 GPU
   - Intel MIC
 
-If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners:
+If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners by specifying `-DTUNERS=ON`, for example as follows:
 
     cmake -DTUNERS=ON ..
 
-Note that CLBlast's tuners are based on the CLTune auto-tuning library, which has to be installed separately (version 1.7.0 or higher). CLTune is available from GitHub.
+Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
 
 Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake.
 
-The tuners output a JSON-file with the results. The best results need to be added to `include/internal/database/xxxxx.h` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
+The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
 
 In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
 
@@ -144,110 +145,125 @@ In summary, tuning the entire library for your device can be done as follows (st
     make
 
 
-Compiling the correctness and performance tests (optional)
+Compiling the correctness tests (optional)
 -------------
 
-To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled:
+To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled by specifying `-DTESTS=ON`, for example as follows:
 
     cmake -DTESTS=ON ..
 
-Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is best tested against [clBLAS](http://github.com/clMathLibraries/clBLAS) for correctness. If the library clBLAS is not installed on your system, it will use a regular CPU BLAS library to test against. If both are present, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables.
+To build these tests, another BLAS library is needed to serve as a reference. This can be either:
 
-With the `-DTESTS=ON` flag, additional performance tests are compiled. These come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against clBLAS and/or a CPU BLAS library.
+* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
+* A regular CPU Netlib BLAS library, e.g.:
+  - OpenBLAS
+  - BLIS
+  - Accelerate
 
+Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
 
-Performance remarks
+All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake.
+
+
+Compiling the performance tests/clients (optional)
 -------------
 
-The CLBlast library provides pre-tuned parameter-values for a number of OpenCL devices. If your device is not among these, then out-of-the-box performance might be poor. Even if the device is included performance might be poor in some cases: __the preview version is not thoroughly tested for performance yet__. See above under `Using the tuners` to find out how to tune for your device.
+To test the performance of CLBlast and compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS) or a CPU BLAS library (see above for requirements), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
 
-The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared against a tuned version of the clBLAS library. The graphs of the level-3 routines (Xgemm, Xsymm, Xsyrk) show the strong points of CLBlast:
+    cmake -DCLIENTS=ON ..
 
-* The library reaches a high peak performance for large matrix sizes, in some cases a factor 2 more than clBLAS.
-* The performance for non-power of 2 values (e.g. 1000) is roughly equal to power of 2 cases (e.g. 1024). This is not the case for clBLAS, which sometimes shows a drop of a factor 2.
-* The performance is also constant for different layouts and transpose options. Again, this is not the case for clBLAS.
+The performance tests come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test, optionally against clBLAS and/or a CPU BLAS library. You can use the command-line options `-clblas 1` or `-cblas 1` to select a library to test against.
 
-The graphs also show the current weak points of CLBlast: for small sizes the benefit is minimal or non-existent, and for some specific configurations clBLAS is still faster.
+The folder `doc/performance` contains some PDF files with performance results on tested devices. Performance is compared in this case against a tuned version of the clBLAS library. These graphs can be generated automatically on your own device. First, compile CLBlast with the clients enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `scripts/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0 from the `build` subdirectory:
 
-These graphs can be generated automatically on your own device. First, compile CLBlast with the tests enabled. Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable. Finally, run one of the graph-scripts found in `test/performance/graphs` using R. For example, to generate the Xgemm PDF on device 1 of platform 0:
+    Rscript ../scripts/graphs/xgemm.r 0 1
 
-    Rscript path/to/test/performance/graphs/xgemm.r 0 1
+Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See above under `Using the tuners` to find out how to tune for your device.
 
 
 Supported routines
 -------------
 
-CLBlast is in active development but already supports almost all the BLAS routines. The supported routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all.
-
-| Level-1  | S | D | C | Z |
-| ---------|---|---|---|---|
-| xSWAP    | ✔ | ✔ | ✔ | ✔ |
-| xSCAL    | ✔ | ✔ | ✔ | ✔ |
-| xCOPY    | ✔ | ✔ | ✔ | ✔ |
-| xAXPY    | ✔ | ✔ | ✔ | ✔ |
-| xDOT     | ✔ | ✔ | - | - |
-| xDOTU    | - | - | ✔ | ✔ |
-| xDOTC    | - | - | ✔ | ✔ |
-| xNRM2    | ✔ | ✔ | ✔ | ✔ |
-| xASUM    | ✔ | ✔ | ✔ | ✔ |
-| IxAMAX   | ✔ | ✔ | ✔ | ✔ |
-
-| Level-2  | S | D | C | Z |
-| ---------|---|---|---|---|
-| xGEMV    | ✔ | ✔ | ✔ | ✔ |
-| xGBMV    | ✔ | ✔ | ✔ | ✔ |
-| xHEMV    | - | - | ✔ | ✔ |
-| xHBMV    | - | - | ✔ | ✔ |
-| xHPMV    | - | - | ✔ | ✔ |
-| xSYMV    | ✔ | ✔ | - | - |
-| xSBMV    | ✔ | ✔ | - | - |
-| xSPMV    | ✔ | ✔ | - | - |
-| xTRMV    | ✔ | ✔ | ✔ | ✔ |
-| xTBMV    | ✔ | ✔ | ✔ | ✔ |
-| xTPMV    | ✔ | ✔ | ✔ | ✔ |
-| xGER     | ✔ | ✔ | - | - |
-| xGERU    | - | - | ✔ | ✔ |
-| xGERC    | - | - | ✔ | ✔ |
-| xHER     | - | - | ✔ | ✔ |
-| xHPR     | - | - | ✔ | ✔ |
-| xHER2    | - | - | ✔ | ✔ |
-| xHPR2    | - | - | ✔ | ✔ |
-| xSYR     | ✔ | ✔ | - | - |
-| xSPR     | ✔ | ✔ | - | - |
-| xSYR2    | ✔ | ✔ | - | - |
-| xSPR2    | ✔ | ✔ | - | - |
-
-| Level-3  | S | D | C | Z |
-| ---------|---|---|---|---|
-| xGEMM    | ✔ | ✔ | ✔ | ✔ |
-| xSYMM    | ✔ | ✔ | ✔ | ✔ |
-| xHEMM    | - | - | ✔ | ✔ |
-| xSYRK    | ✔ | ✔ | ✔ | ✔ |
-| xHERK    | - | - | ✔ | ✔ |
-| xSYR2K   | ✔ | ✔ | ✔ | ✔ |
-| xHER2K   | - | - | ✔ | ✔ |
-| xTRMM    | ✔ | ✔ | ✔ | ✔ |
-
-In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care:
-
-| Additional | S | D | C | Z |
-| -----------|---|---|---|---|
-| xSUM       | ✔ | ✔ | ✔ | ✔ |
-| IxMAX      | ✔ | ✔ | ✔ | ✔ |
-| IxMIN      | ✔ | ✔ | ✔ | ✔ |
-
-Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
-
-| Unsupported | S | D | C | Z |
-| ------------|---|---|---|---|
-| xROTG       |   |   | - | - |
-| xROTMG      |   |   | - | - |
-| xROT        |   |   | - | - |
-| xROTM       |   |   | - | - |
-| xTRSV       |   |   |   |   |
-| xTBSV       |   |   |   |   |
-| xTPSV       |   |   |   |   |
-| xTRSM       |   |   |   |   |
+CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-BLAS routines. The supported BLAS routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all. The different data-types supported by the library are:
+
+* __S:__ Single-precision 32-bit floating-point (`float`).
+* __D:__ Double-precision 64-bit floating-point (`double`).
+* __C:__ Complex single-precision 2x32-bit floating-point (`std::complex<float>`).
+* __Z:__ Complex double-precision 2x64-bit floating-point (`std::complex<double>`).
+* __H:__ Half-precision 16-bit floating-point (`cl_half`). See section 'Half precision' for more information.
+
+| Level-1  | S | D | C | Z | H |
+| ---------|---|---|---|---|---|
+| xSWAP    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xSCAL    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xCOPY    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xAXPY    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xDOT     | ✔ | ✔ | - | - | ✔ |
+| xDOTU    | - | - | ✔ | ✔ | - |
+| xDOTC    | - | - | ✔ | ✔ | - |
+| xNRM2    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xASUM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| IxAMAX   | ✔ | ✔ | ✔ | ✔ | ✔ |
+
+| Level-2  | S | D | C | Z | H |
+| ---------|---|---|---|---|---|
+| xGEMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xGBMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHEMV    | - | - | ✔ | ✔ | - |
+| xHBMV    | - | - | ✔ | ✔ | - |
+| xHPMV    | - | - | ✔ | ✔ | - |
+| xSYMV    | ✔ | ✔ | - | - | ✔ |
+| xSBMV    | ✔ | ✔ | - | - | ✔ |
+| xSPMV    | ✔ | ✔ | - | - | ✔ |
+| xTRMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xTBMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xTPMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xGER     | ✔ | ✔ | - | - | ✔ |
+| xGERU    | - | - | ✔ | ✔ | - |
+| xGERC    | - | - | ✔ | ✔ | - |
+| xHER     | - | - | ✔ | ✔ | - |
+| xHPR     | - | - | ✔ | ✔ | - |
+| xHER2    | - | - | ✔ | ✔ | - |
+| xHPR2    | - | - | ✔ | ✔ | - |
+| xSYR     | ✔ | ✔ | - | - | ✔ |
+| xSPR     | ✔ | ✔ | - | - | ✔ |
+| xSYR2    | ✔ | ✔ | - | - | ✔ |
+| xSPR2    | ✔ | ✔ | - | - | ✔ |
+
+| Level-3  | S | D | C | Z | H |
+| ---------|---|---|---|---|---|
+| xGEMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xSYMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHEMM    | - | - | ✔ | ✔ | - |
+| xSYRK    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHERK    | - | - | ✔ | ✔ | - |
+| xSYR2K   | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHER2K   | - | - | ✔ | ✔ | - |
+| xTRMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+
+In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
+
+| Level-X    | S | D | C | Z | H |
+| -----------|---|---|---|---|---|
+| xSUM       | ✔ | ✔ | ✔ | ✔ | ✔ |
+| IxMAX      | ✔ | ✔ | ✔ | ✔ | ✔ |
+| IxMIN      | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xOMATCOPY  | ✔ | ✔ | ✔ | ✔ | ✔ |
+
+Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTRSV, xTBSV, xTPSV, and xTRSM.
+
+
+Half precision (fp16)
+-------------
+
+The half-precision fp16 format is a 16-bit floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor of 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this are, among others, Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is of particular interest to the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.
+
+Since there is no half-precision data-type in C or C++, OpenCL provides the `cl_half` type for the host device. Unfortunately, internally this translates to a 16-bit integer, so computations on the host using this data-type should be avoided. For convenience, CLBlast provides the `clblast_half.h` header (C99 and C++ compatible), defining the `half` type as a short-hand for `cl_half` and the following basic functions:
+
+* `half FloatToHalf(const float value)`: Converts a 32-bit floating-point value to a 16-bit floating-point value.
+* `float HalfToFloat(const half value)`: Converts a 16-bit floating-point value to a 32-bit floating-point value.
+
+The `samples/haxpy.c` example shows how to use these convenience functions when calling the half-precision BLAS routine HAXPY.
 
 
 Contributing
@@ -257,7 +273,7 @@ Contributions are welcome in the form of tuning results for OpenCL devices previ
 
 The contributing authors (code, pull requests, testing) so far are:
 
-* [Cedric Nugteren](http://www.cedricnugteren.nl)
+* [Cedric Nugteren](http://www.cedricnugteren.nl) - main author
 * [Anton Lokhmotov](https://github.com/psyhtest)
 * [Dragan Djuric](https://github.com/blueberry)
 * [Marco Hutter](https://github.com/gpus)
@@ -270,14 +286,8 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
 * [dividiti](http://www.dividiti.com)
 * [SURFsara HPC center](http://www.surfsara.com)
 
+
 Support us
 -------------
 
 This project started in March 2015 as an evenings and weekends free-time project next to a full-time job for Cedric Nugteren. If you are in the position to support the project by OpenCL-hardware donations or otherwise, please find contact information on the [website of the main author](http://www.cedricnugteren.nl).
-
-
-To-do list before release of version 1.0
--------------
-
-- Add half-precision routines (e.g. HGEMM)
-- Add API documentation
diff --git a/doc/clblast.md b/doc/clblast.md
index 9c9b9a6f..5105d023 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -5,7 +5,7 @@ CLBlast: API reference
 xSWAP: Swap two vectors
 -------------
 
-Interchanges the contents of vectors x and y.
+Interchanges _n_ elements of vectors _x_ and _y_.
 
 C++ API:
 ```
@@ -34,17 +34,21 @@ StatusCode CLBlastZswap(const size_t n,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHswap(const size_t n,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SWAP:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
 * `const size_t x_offset`: The offset in elements from the start of the output x vector.
-* `const size_t x_inc`: Stride/increment of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -53,7 +57,7 @@ Arguments to SWAP:
 xSCAL: Vector scaling
 -------------
 
-Multiplies all elements of vector x by a scalar constant alpha.
+Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.
 
 C++ API:
 ```
@@ -82,15 +86,19 @@ StatusCode CLBlastZscal(const size_t n,
                         const cl_double2 alpha,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHscal(const size_t n,
+                        const cl_half alpha,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SCAL:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
 * `const size_t x_offset`: The offset in elements from the start of the output x vector.
-* `const size_t x_inc`: Stride/increment of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -99,7 +107,7 @@ Arguments to SCAL:
 xCOPY: Vector copy
 -------------
 
-Copies the contents of vector x into vector y.
+Copies the contents of vector _x_ into vector _y_.
 
 C++ API:
 ```
@@ -128,17 +136,21 @@ StatusCode CLBlastZcopy(const size_t n,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHcopy(const size_t n,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to COPY:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -147,7 +159,7 @@ Arguments to COPY:
 xAXPY: Vector-times-constant plus vector
 -------------
 
-Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.
+Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.
 
 C++ API:
 ```
@@ -181,18 +193,23 @@ StatusCode CLBlastZaxpy(const size_t n,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHaxpy(const size_t n,
+                        const cl_half alpha,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to AXPY:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -201,7 +218,7 @@ Arguments to AXPY:
 xDOT: Dot product of two vectors
 -------------
 
-Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.
+Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.
 
 C++ API:
 ```
@@ -225,21 +242,26 @@ StatusCode CLBlastDdot(const size_t n,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHdot(const size_t n,
+                       cl_mem dot_buffer, const size_t dot_offset,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                       cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to DOT:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
 * `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
 * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
 * `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -276,17 +298,17 @@ StatusCode CLBlastZdotu(const size_t n,
 
 Arguments to DOTU:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
 * `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
 * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
 * `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -323,17 +345,17 @@ StatusCode CLBlastZdotc(const size_t n,
 
 Arguments to DOTC:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
 * `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
 * `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
 * `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -342,7 +364,7 @@ Arguments to DOTC:
 xNRM2: Euclidian norm of a vector
 -------------
 
-Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.
+Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.
 
 C++ API:
 ```
@@ -371,18 +393,22 @@ StatusCode CLBlastDznrm2(const size_t n,
                         cl_mem nrm2_buffer, const size_t nrm2_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHnrm2(const size_t n,
+                        cl_mem nrm2_buffer, const size_t nrm2_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to NRM2:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem nrm2_buffer`: OpenCL buffer to store the output nrm2 vector.
 * `const size_t nrm2_offset`: The offset in elements from the start of the output nrm2 vector.
 * `cl_mem nrm2_buffer`: OpenCL buffer to store the output nrm2 vector.
 * `const size_t nrm2_offset`: The offset in elements from the start of the output nrm2 vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -391,7 +417,7 @@ Arguments to NRM2:
 xASUM: Absolute sum of values in a vector
 -------------
 
-Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.
+Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.
 
 C++ API:
 ```
@@ -420,18 +446,22 @@ StatusCode CLBlastDzasum(const size_t n,
                         cl_mem asum_buffer, const size_t asum_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHasum(const size_t n,
+                        cl_mem asum_buffer, const size_t asum_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to ASUM:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem asum_buffer`: OpenCL buffer to store the output asum vector.
 * `const size_t asum_offset`: The offset in elements from the start of the output asum vector.
 * `cl_mem asum_buffer`: OpenCL buffer to store the output asum vector.
 * `const size_t asum_offset`: The offset in elements from the start of the output asum vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -440,7 +470,7 @@ Arguments to ASUM:
 xSUM: Sum of values in a vector (non-BLAS function)
 -------------
 
-Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.
+Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.
 
 C++ API:
 ```
@@ -469,18 +499,22 @@ StatusCode CLBlastDzsum(const size_t n,
                        cl_mem sum_buffer, const size_t sum_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsum(const size_t n,
+                       cl_mem sum_buffer, const size_t sum_offset,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SUM:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem sum_buffer`: OpenCL buffer to store the output sum vector.
 * `const size_t sum_offset`: The offset in elements from the start of the output sum vector.
 * `cl_mem sum_buffer`: OpenCL buffer to store the output sum vector.
 * `const size_t sum_offset`: The offset in elements from the start of the output sum vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -489,7 +523,7 @@ Arguments to SUM:
 xAMAX: Index of absolute maximum value in a vector
 -------------
 
-Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.
+Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.
 
 C++ API:
 ```
@@ -518,18 +552,22 @@ StatusCode CLBlastiZamax(const size_t n,
                         cl_mem imax_buffer, const size_t imax_offset,
                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastiHamax(const size_t n,
+                         cl_mem imax_buffer, const size_t imax_offset,
+                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to AMAX:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector.
 * `const size_t imax_offset`: The offset in elements from the start of the output imax vector.
 * `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector.
 * `const size_t imax_offset`: The offset in elements from the start of the output imax vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -538,7 +576,7 @@ Arguments to AMAX:
 xMAX: Index of maximum value in a vector (non-BLAS function)
 -------------
 
-Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
+Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
 
 C++ API:
 ```
@@ -567,18 +605,22 @@ StatusCode CLBlastiZmax(const size_t n,
                        cl_mem imax_buffer, const size_t imax_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastiHmax(const size_t n,
+                        cl_mem imax_buffer, const size_t imax_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to MAX:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector.
 * `const size_t imax_offset`: The offset in elements from the start of the output imax vector.
 * `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector.
 * `const size_t imax_offset`: The offset in elements from the start of the output imax vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -587,7 +629,7 @@ Arguments to MAX:
 xMIN: Index of minimum value in a vector (non-BLAS function)
 -------------
 
-Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
+Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
 
 C++ API:
 ```
@@ -616,18 +658,22 @@ StatusCode CLBlastiZmin(const size_t n,
                        cl_mem imin_buffer, const size_t imin_offset,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastiHmin(const size_t n,
+                        cl_mem imin_buffer, const size_t imin_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to MIN:
 
-* `const size_t n`: Integer size argument.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector.
 * `const size_t imin_offset`: The offset in elements from the start of the output imin vector.
 * `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector.
 * `const size_t imin_offset`: The offset in elements from the start of the output imin vector.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -636,7 +682,7 @@ Arguments to MIN:
 xGEMV: General matrix-vector multiplication
 -------------
 
-Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.
+Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.
 
 C++ API:
 ```
@@ -685,34 +731,46 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose,
                         const cl_double2 beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose,
+                        const size_t m, const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to GEMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for GEMV:
+
+* The value of `a_ld` must be at least `m`.
+
 
 
 xGBMV: General banded matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is banded instead.
+Same operation as xGEMV, but matrix _A_ is banded instead.
 
 C++ API:
 ```
@@ -761,36 +819,48 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
                         const cl_double2 beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
+                        const size_t m, const size_t n, const size_t kl, const size_t ku,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to GBMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
-* `const size_t kl`: Integer size argument.
-* `const size_t ku`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t kl`: Integer size argument. This value must be positive.
+* `const size_t ku`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for GBMV:
+
+* The value of `a_ld` must be at least `kl + ku + 1`.
+
 
 
 xHEMV: Hermitian matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is an Hermitian matrix instead.
+Same operation as xGEMV, but matrix _A_ is a Hermitian matrix instead.
 
 C++ API:
 ```
@@ -827,29 +897,33 @@ StatusCode CLBlastZhemv(const Layout layout, const Triangle triangle,
 
 Arguments to HEMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for HEMV:
+
+* The value of `a_ld` must be at least `n`.
+
 
 
 xHBMV: Hermitian banded matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.
+Same operation as xGEMV, but matrix _A_ is a Hermitian banded matrix instead.
 
 C++ API:
 ```
@@ -886,30 +960,34 @@ StatusCode CLBlastZhbmv(const Layout layout, const Triangle triangle,
 
 Arguments to HBMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
-* `const size_t k`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for HBMV:
+
+* The value of `a_ld` must be at least `k + 1`.
+
 
 
 xHPMV: Hermitian packed matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.
+Same operation as xGEMV, but matrix _A_ is a Hermitian packed matrix instead and represented as _AP_.
 
 C++ API:
 ```
@@ -946,19 +1024,19 @@ StatusCode CLBlastZhpmv(const Layout layout, const Triangle triangle,
 
 Arguments to HPMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
 * `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -967,7 +1045,7 @@ Arguments to HPMV:
 xSYMV: Symmetric matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is symmetric instead.
+Same operation as xGEMV, but matrix _A_ is symmetric instead.
 
 C++ API:
 ```
@@ -1000,33 +1078,45 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle,
                         const double beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle,
+                        const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SYMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for SYMV:
+
+* The value of `a_ld` must be at least `n`.
+
 
 
 xSBMV: Symmetric banded matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is symmetric and banded instead.
+Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.
 
 C++ API:
 ```
@@ -1059,34 +1149,46 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle,
                         const double beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle,
+                        const size_t n, const size_t k,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SBMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
-* `const size_t k`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for SBMV:
+
+* The value of `a_ld` must be at least `k + 1`.
+
 
 
 xSPMV: Symmetric packed matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.
+Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.
 
 C++ API:
 ```
@@ -1119,23 +1221,31 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle,
                         const double beta,
                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle,
+                        const size_t n,
+                        const cl_half alpha,
+                        const cl_mem ap_buffer, const size_t ap_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SPMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
 * `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
 * `const size_t y_offset`: The offset in elements from the start of the output y vector.
-* `const size_t y_inc`: Stride/increment of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -1144,7 +1254,7 @@ Arguments to SPMV:
 xTRMV: Triangular matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is triangular instead.
+Same operation as xGEMV, but matrix _A_ is triangular instead.
 
 C++ API:
 ```
@@ -1178,30 +1288,39 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t n,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to TRMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const Diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for a non-unit values on the diagonal or `Diagonal::kUnit` (132) for a unit values on the diagonal.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
 * `const size_t x_offset`: The offset in elements from the start of the output x vector.
-* `const size_t x_inc`: Stride/increment of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for TRMV:
+
+* The value of `a_ld` must be at least `n`.
+
 
 
 xTBMV: Triangular banded matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is triangular and banded instead.
+Same operation as xGEMV, but matrix _A_ is triangular and banded instead.
 
 C++ API:
 ```
@@ -1235,31 +1354,40 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t n, const size_t k,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to TBMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const Diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for a non-unit values on the diagonal or `Diagonal::kUnit` (132) for a unit values on the diagonal.
-* `const size_t n`: Integer size argument.
-* `const size_t k`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
 * `const size_t x_offset`: The offset in elements from the start of the output x vector.
-* `const size_t x_inc`: Stride/increment of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for TBMV:
+
+* The value of `a_ld` must be at least `k + 1`.
+
 
 
 xTPMV: Triangular packed matrix-vector multiplication
 -------------
 
-Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.
+Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.
 
 C++ API:
 ```
@@ -1293,20 +1421,25 @@ StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Tran
                         const cl_mem ap_buffer, const size_t ap_offset,
                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t n,
+                        const cl_mem ap_buffer, const size_t ap_offset,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to TPMV:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const Diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for a non-unit values on the diagonal or `Diagonal::kUnit` (132) for a unit values on the diagonal.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
 * `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
 * `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
 * `const size_t x_offset`: The offset in elements from the start of the output x vector.
-* `const size_t x_inc`: Stride/increment of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
@@ -1315,7 +1448,7 @@ Arguments to TPMV:
 xGER: General rank-1 matrix update
 -------------
 
-
+Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.
 
 C++ API:
 ```
@@ -1345,32 +1478,43 @@ StatusCode CLBlastDger(const Layout layout,
                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHger(const Layout layout,
+                       const size_t m, const size_t n,
+                       const cl_half alpha,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                       cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                       cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to GER:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the output A matrix.
-* `const size_t a_ld`: Leading dimension of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for GER:
+
+* The value of `a_ld` must be at least `m`.
+
 
 
 xGERU: General rank-1 complex matrix update
 -------------
 
-
+Same operation as xGER, but with complex data-types.
 
 C++ API:
 ```
@@ -1404,28 +1548,32 @@ StatusCode CLBlastZgeru(const Layout layout,
 
 Arguments to GERU:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the output A matrix.
-* `const size_t a_ld`: Leading dimension of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for GERU:
+
+* The value of `a_ld` must be at least `m`.
+
 
 
 xGERC: General rank-1 complex conjugated matrix update
 -------------
 
-
+Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.
 
 C++ API:
 ```
@@ -1459,28 +1607,32 @@ StatusCode CLBlastZgerc(const Layout layout,
 
 Arguments to GERC:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the output A matrix.
-* `const size_t a_ld`: Leading dimension of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for GERC:
+
+* The value of `a_ld` must be at least `m`.
+
 
 
 xHER: Hermitian rank-1 matrix update
 -------------
 
-
+Performs the operation _A = alpha * x * x^T + A_, in which _x_ is an input vector, _x^T_ is the transpose of this vector, _A_ is the triangular Hermitian matrix to be updated, and _alpha_ is a scalar value.
 
 C++ API:
 ```
@@ -1511,25 +1663,29 @@ StatusCode CLBlastZher(const Layout layout, const Triangle triangle,
 
 Arguments to HER:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the output A matrix.
-* `const size_t a_ld`: Leading dimension of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for HER:
+
+* The value of `a_ld` must be at least `n`.
+
 
 
 xHPR: Hermitian packed rank-1 matrix update
 -------------
 
-
+Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.
 
 C++ API:
 ```
@@ -1560,13 +1716,13 @@ StatusCode CLBlastZhpr(const Layout layout, const Triangle triangle,
 
 Arguments to HPR:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
 * `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
@@ -1577,7 +1733,7 @@ Arguments to HPR:
 xHER2: Hermitian rank-2 matrix update
 -------------
 
-
+Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermitian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.
 
 C++ API:
 ```
@@ -1611,28 +1767,32 @@ StatusCode CLBlastZher2(const Layout layout, const Triangle triangle,
 
 Arguments to HER2:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the output A matrix.
-* `const size_t a_ld`: Leading dimension of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for HER2:
+
+* The value of `a_ld` must be at least `n`.
+
 
 
 xHPR2: Hermitian packed rank-2 matrix update
 -------------
 
-
+Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.
 
 C++ API:
 ```
@@ -1666,16 +1826,16 @@ StatusCode CLBlastZhpr2(const Layout layout, const Triangle triangle,
 
 Arguments to HPR2:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
 * `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
@@ -1686,7 +1846,7 @@ Arguments to HPR2:
 xSYR: Symmetric rank-1 matrix update
 -------------
 
-
+Same operation as xHER, but matrix _A_ is a symmetric matrix instead.
 
 C++ API:
 ```
@@ -1713,29 +1873,39 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                        cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
+                       const size_t n,
+                       const cl_half alpha,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                       cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SYR:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the output A matrix.
-* `const size_t a_ld`: Leading dimension of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for SYR:
+
+* The value of `a_ld` must be at least `n`.
+
 
 
 xSPR: Symmetric packed rank-1 matrix update
 -------------
 
-
+Same operation as xSYR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.
 
 C++ API:
 ```
@@ -1762,17 +1932,23 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                        cl_mem ap_buffer, const size_t ap_offset,
                        cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
+                       const size_t n,
+                       const cl_half alpha,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       cl_mem ap_buffer, const size_t ap_offset,
+                       cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SPR:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
 * `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
@@ -1783,7 +1959,7 @@ Arguments to SPR:
 xSYR2: Symmetric rank-2 matrix update
 -------------
 
-
+Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.
 
 C++ API:
 ```
@@ -1813,32 +1989,43 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
+                        const size_t n,
+                        const cl_half alpha,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SYR2:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the output A matrix.
-* `const size_t a_ld`: Leading dimension of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for SYR2:
+
+* The value of `a_ld` must be at least `n`.
+
 
 
 xSPR2: Symmetric packed rank-2 matrix update
 -------------
 
-
+Same operation as xSYR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.
 
 C++ API:
 ```
@@ -1868,20 +2055,27 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_mem ap_buffer, const size_t ap_offset,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
+                        const size_t n,
+                        const cl_half alpha,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_mem ap_buffer, const size_t ap_offset,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SPR2:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
 * `const size_t x_offset`: The offset in elements from the start of the input x vector.
-* `const size_t x_inc`: Stride/increment of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
 * `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
 * `const size_t y_offset`: The offset in elements from the start of the input y vector.
-* `const size_t y_inc`: Stride/increment of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
 * `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
 * `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
@@ -1892,7 +2086,7 @@ Arguments to SPR2:
 xGEMM: General matrix-matrix multiplication
 -------------
 
-
+Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.
 
 C++ API:
 ```
@@ -1941,36 +2135,50 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const
                         const cl_double2 beta,
                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                        const size_t m, const size_t n, const size_t k,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                        const cl_half beta,
+                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to GEMM:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const Transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
-* `const size_t k`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
 * `const size_t b_offset`: The offset in elements from the start of the input B matrix.
-* `const size_t b_ld`: Leading dimension of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
 * `const size_t c_offset`: The offset in elements from the start of the output C matrix.
-* `const size_t c_ld`: Leading dimension of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for GEMM:
+
+* When `a_transpose == Transpose::kNo`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`.
+* When `b_transpose == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`.
+* The value of `c_ld` must be at least `m`.
+
 
 
 xSYMM: Symmetric matrix-matrix multiplication
 -------------
 
-
+Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmetric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.
 
 C++ API:
 ```
@@ -2019,35 +2227,49 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri
                         const cl_double2 beta,
                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
+                        const size_t m, const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                        const cl_half beta,
+                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SYMM:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Side`: The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142).
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
 * `const size_t b_offset`: The offset in elements from the start of the input B matrix.
-* `const size_t b_ld`: Leading dimension of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
 * `const size_t c_offset`: The offset in elements from the start of the output C matrix.
-* `const size_t c_ld`: Leading dimension of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for SYMM:
+
+* When `side == Side::kLeft`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
+* The value of `b_ld` must be at least `m`.
+* The value of `c_ld` must be at least `m`.
+
 
 
 xHEMM: Hermitian matrix-matrix multiplication
 -------------
 
-
+Same operation as xSYMM, but _A_ is an Hermitian matrix instead.
 
 C++ API:
 ```
@@ -2084,31 +2306,37 @@ StatusCode CLBlastZhemm(const Layout layout, const Side side, const Triangle tri
 
 Arguments to HEMM:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Side`: The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142).
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
 * `const size_t b_offset`: The offset in elements from the start of the input B matrix.
-* `const size_t b_ld`: Leading dimension of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
 * `const size_t c_offset`: The offset in elements from the start of the output C matrix.
-* `const size_t c_ld`: Leading dimension of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for HEMM:
+
+* When `side == Side::kLeft`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
+* The value of `b_ld` must be at least `m`.
+* The value of `c_ld` must be at least `m`.
+
 
 
 xSYRK: Rank-K update of a symmetric matrix
 -------------
 
-
+Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.
 
 C++ API:
 ```
@@ -2152,32 +2380,44 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran
                         const cl_double2 beta,
                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                        const size_t n, const size_t k,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_half beta,
+                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SYRK:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const size_t n`: Integer size argument.
-* `const size_t k`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
 * `const size_t c_offset`: The offset in elements from the start of the output C matrix.
-* `const size_t c_ld`: Leading dimension of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for SYRK:
+
+* When `a_transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
+* The value of `c_ld` must be at least `n`.
+
 
 
 xHERK: Rank-K update of a hermitian matrix
 -------------
 
-
+Same operation as xSYRK, but _C_ is an Hermitian matrix instead.
 
 C++ API:
 ```
@@ -2211,28 +2451,33 @@ StatusCode CLBlastZherk(const Layout layout, const Triangle triangle, const Tran
 
 Arguments to HERK:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const size_t n`: Integer size argument.
-* `const size_t k`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
 * `const size_t c_offset`: The offset in elements from the start of the output C matrix.
-* `const size_t c_ld`: Leading dimension of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for HERK:
+
+* When `a_transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
+* The value of `c_ld` must be at least `n`.
+
 
 
 xSYR2K: Rank-2K update of a symmetric matrix
 -------------
 
-
+Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.
 
 C++ API:
 ```
@@ -2281,35 +2526,49 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra
                          const cl_double2 beta,
                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                         const size_t n, const size_t k,
+                         const cl_half alpha,
+                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const cl_half beta,
+                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to SYR2K:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const Transpose`: Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const size_t n`: Integer size argument.
-* `const size_t k`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose ab_transpose`: Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
 * `const size_t b_offset`: The offset in elements from the start of the input B matrix.
-* `const size_t b_ld`: Leading dimension of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
 * `const T beta`: Input scalar constant.
 * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
 * `const size_t c_offset`: The offset in elements from the start of the output C matrix.
-* `const size_t c_ld`: Leading dimension of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for SYR2K:
+
+* When `ab_transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
+* When `ab_transpose == Transpose::kNo`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`.
+* The value of `c_ld` must be at least `n`.
+
 
 
 xHER2K: Rank-2K update of a hermitian matrix
 -------------
 
-
+Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.
 
 C++ API:
 ```
@@ -2346,31 +2605,37 @@ StatusCode CLBlastZher2k(const Layout layout, const Triangle triangle, const Tra
 
 Arguments to HER2K:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const Transpose`: Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const size_t n`: Integer size argument.
-* `const size_t k`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose ab_transpose`: Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
 * `const size_t b_offset`: The offset in elements from the start of the input B matrix.
-* `const size_t b_ld`: Leading dimension of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
 * `const U beta`: Input scalar constant.
 * `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
 * `const size_t c_offset`: The offset in elements from the start of the output C matrix.
-* `const size_t c_ld`: Leading dimension of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for HER2K:
+
+* When `ab_transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
+* When `ab_transpose == Transpose::kNo`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`.
+* The value of `c_ld` must be at least `n`.
+
 
 
 xTRMM: Triangular matrix-matrix multiplication
 -------------
 
-
+Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.
 
 C++ API:
 ```
@@ -2409,26 +2674,110 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri
                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                         cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t m, const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                        cl_command_queue* queue, cl_event* event)
 ```
 
 Arguments to TRMM:
 
-* `const Layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
-* `const Side`: The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142).
-* `const Triangle`: The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
-* `const Transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
-* `const Diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for a non-unit values on the diagonal or `Diagonal::kUnit` (132) for a unit values on the diagonal.
-* `const size_t m`: Integer size argument.
-* `const size_t n`: Integer size argument.
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
 * `const T alpha`: Input scalar constant.
 * `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
 * `const size_t a_offset`: The offset in elements from the start of the input A matrix.
-* `const size_t a_ld`: Leading dimension of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
 * `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
 * `const size_t b_offset`: The offset in elements from the start of the output B matrix.
-* `const size_t b_ld`: Leading dimension of the output B matrix.
+* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
 * `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
 * `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
 
+Requirements for TRMM:
+
+* When `side == Side::kLeft`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
+* The value of `b_ld` must be at least `m`.
+
+
+
+xOMATCOPY: Scaling and out-of-place transpose/copy (non-BLAS function)
+-------------
+
+Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.
+
+C++ API:
+```
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                    cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const float alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const double alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_float2 alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_double2 alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_half alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to OMATCOPY:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
+* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for OMATCOPY:
+
+* The value of `a_ld` must be at least `m`.
+* The value of `b_ld` must be at least `n`.
+
 
 
diff --git a/include/clblast.h b/include/clblast.h
index 5df0f605..c8596b39 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -68,8 +68,8 @@ enum class StatusCode {
   kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
   kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
   kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
-  kInvalidVectorDot          = -2043, // Vector dot is not a valid OpenCL buffer
-  kInsufficientMemoryDot     = -2042, // Vector dot's OpenCL buffer is too small
+  kInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
+  kInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
 };
 
 // Matrix layout and transpose types
@@ -121,28 +121,28 @@ StatusCode Rotm(const size_t n,
                 cl_mem sparam_buffer, const size_t sparam_offset,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
+// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
 template <typename T>
 StatusCode Swap(const size_t n,
                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
+// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
 template <typename T>
 StatusCode Scal(const size_t n,
                 const T alpha,
                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
+// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
 template <typename T>
 StatusCode Copy(const size_t n,
                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
+// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
 template <typename T>
 StatusCode Axpy(const size_t n,
                 const T alpha,
@@ -150,7 +150,7 @@ StatusCode Axpy(const size_t n,
                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Dot product of two vectors: SDOT/DDOT
+// Dot product of two vectors: SDOT/DDOT/HDOT
 template <typename T>
 StatusCode Dot(const size_t n,
                cl_mem dot_buffer, const size_t dot_offset,
@@ -174,42 +174,42 @@ StatusCode Dotc(const size_t n,
                 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
+// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
 template <typename T>
 StatusCode Nrm2(const size_t n,
                 cl_mem nrm2_buffer, const size_t nrm2_offset,
                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
+// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
 template <typename T>
 StatusCode Asum(const size_t n,
                 cl_mem asum_buffer, const size_t asum_offset,
                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
+// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
 template <typename T>
 StatusCode Sum(const size_t n,
                cl_mem sum_buffer, const size_t sum_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_command_queue* queue, cl_event* event = nullptr);
 
-// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
+// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
 template <typename T>
 StatusCode Amax(const size_t n,
                 cl_mem imax_buffer, const size_t imax_offset,
                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
+// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
 template <typename T>
 StatusCode Max(const size_t n,
                cl_mem imax_buffer, const size_t imax_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_command_queue* queue, cl_event* event = nullptr);
 
-// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
+// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
 template <typename T>
 StatusCode Min(const size_t n,
                cl_mem imin_buffer, const size_t imin_offset,
@@ -220,7 +220,7 @@ StatusCode Min(const size_t n,
 // BLAS level-2 (matrix-vector) routines
 // =================================================================================================
 
-// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
+// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
 template <typename T>
 StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                 const size_t m, const size_t n,
@@ -231,7 +231,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
+// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
 template <typename T>
 StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
                 const size_t m, const size_t n, const size_t kl, const size_t ku,
@@ -275,7 +275,7 @@ StatusCode Hpmv(const Layout layout, const Triangle triangle,
                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Symmetric matrix-vector multiplication: SSYMV/DSYMV
+// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
 template <typename T>
 StatusCode Symv(const Layout layout, const Triangle triangle,
                 const size_t n,
@@ -286,7 +286,7 @@ StatusCode Symv(const Layout layout, const Triangle triangle,
                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
+// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
 template <typename T>
 StatusCode Sbmv(const Layout layout, const Triangle triangle,
                 const size_t n, const size_t k,
@@ -297,7 +297,7 @@ StatusCode Sbmv(const Layout layout, const Triangle triangle,
                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
+// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
 template <typename T>
 StatusCode Spmv(const Layout layout, const Triangle triangle,
                 const size_t n,
@@ -308,7 +308,7 @@ StatusCode Spmv(const Layout layout, const Triangle triangle,
                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
+// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
 template <typename T>
 StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t n,
@@ -316,7 +316,7 @@ StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_
                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
+// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
 template <typename T>
 StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t n, const size_t k,
@@ -324,7 +324,7 @@ StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_
                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
+// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
 template <typename T>
 StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t n,
@@ -356,7 +356,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_
                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// General rank-1 matrix update: SGER/DGER
+// General rank-1 matrix update: SGER/DGER/HGER
 template <typename T>
 StatusCode Ger(const Layout layout,
                const size_t m, const size_t n,
@@ -424,7 +424,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
                 cl_mem ap_buffer, const size_t ap_offset,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Symmetric rank-1 matrix update: SSYR/DSYR
+// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
 template <typename T>
 StatusCode Syr(const Layout layout, const Triangle triangle,
                const size_t n,
@@ -433,7 +433,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                cl_command_queue* queue, cl_event* event = nullptr);
 
-// Symmetric packed rank-1 matrix update: SSPR/DSPR
+// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
 template <typename T>
 StatusCode Spr(const Layout layout, const Triangle triangle,
                const size_t n,
@@ -442,7 +442,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
                cl_mem ap_buffer, const size_t ap_offset,
                cl_command_queue* queue, cl_event* event = nullptr);
 
-// Symmetric rank-2 matrix update: SSYR2/DSYR2
+// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
 template <typename T>
 StatusCode Syr2(const Layout layout, const Triangle triangle,
                 const size_t n,
@@ -452,7 +452,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
                 cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
 template <typename T>
 StatusCode Spr2(const Layout layout, const Triangle triangle,
                 const size_t n,
@@ -466,7 +466,7 @@ StatusCode Spr2(const Layout layout, const Triangle triangle,
 // BLAS level-3 (matrix-matrix) routines
 // =================================================================================================
 
-// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
+// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
 template <typename T>
 StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                 const size_t m, const size_t n, const size_t k,
@@ -477,7 +477,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
+// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
 template <typename T>
 StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                 const size_t m, const size_t n,
@@ -499,7 +499,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
+// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
 template <typename T>
 StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                 const size_t n, const size_t k,
@@ -519,7 +519,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
 template <typename T>
 StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                  const size_t n, const size_t k,
@@ -541,7 +541,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
                  cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                  cl_command_queue* queue, cl_event* event = nullptr);
 
-// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
+// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
 template <typename T>
 StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t m, const size_t n,
@@ -550,7 +550,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, c
                 cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                 cl_command_queue* queue, cl_event* event = nullptr);
 
-// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
 template <typename T>
 StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t m, const size_t n,
@@ -560,6 +560,19 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c
                 cl_command_queue* queue, cl_event* event = nullptr);
 
 // =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                    cl_command_queue* queue, cl_event* event = nullptr);
+
+// =================================================================================================
 
 // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
 // for the same device. This cache can be cleared to free up system memory or in case of debugging.
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 8b2bf73c..b92febac 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -77,8 +77,8 @@ typedef enum StatusCode_ {
   kInvalidLocalMemUsage      = -2046, // Not enough local memory available on this device
   kNoHalfPrecision           = -2045, // Half precision (16-bits) not supported by the device
   kNoDoublePrecision         = -2044, // Double precision (64-bits) not supported by the device
-  kInvalidVectorDot          = -2043, // Vector dot is not a valid OpenCL buffer
-  kInsufficientMemoryDot     = -2042, // Vector dot's OpenCL buffer is too small
+  kInvalidVectorScalar       = -2043, // The unit-sized vector is not a valid OpenCL buffer
+  kInsufficientMemoryScalar  = -2042, // The unit-sized vector's OpenCL buffer is too small
 } StatusCode;
 
 // Matrix layout and transpose types
@@ -148,7 +148,7 @@ StatusCode PUBLIC_API CLBlastDrotm(const size_t n,
                                    cl_mem sparam_buffer, const size_t sparam_offset,
                                    cl_command_queue* queue, cl_event* event);
 
-// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
+// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
 StatusCode PUBLIC_API CLBlastSswap(const size_t n,
                                    cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
@@ -165,8 +165,12 @@ StatusCode PUBLIC_API CLBlastZswap(const size_t n,
                                    cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHswap(const size_t n,
+                                   cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
+// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
 StatusCode PUBLIC_API CLBlastSscal(const size_t n,
                                    const float alpha,
                                    cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -183,8 +187,12 @@ StatusCode PUBLIC_API CLBlastZscal(const size_t n,
                                    const cl_double2 alpha,
                                    cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHscal(const size_t n,
+                                   const cl_half alpha,
+                                   cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
+// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
 StatusCode PUBLIC_API CLBlastScopy(const size_t n,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
@@ -201,8 +209,12 @@ StatusCode PUBLIC_API CLBlastZcopy(const size_t n,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHcopy(const size_t n,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
+// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
 StatusCode PUBLIC_API CLBlastSaxpy(const size_t n,
                                    const float alpha,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -223,8 +235,13 @@ StatusCode PUBLIC_API CLBlastZaxpy(const size_t n,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHaxpy(const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Dot product of two vectors: SDOT/DDOT
+// Dot product of two vectors: SDOT/DDOT/HDOT
 StatusCode PUBLIC_API CLBlastSdot(const size_t n,
                                   cl_mem dot_buffer, const size_t dot_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -235,6 +252,11 @@ StatusCode PUBLIC_API CLBlastDdot(const size_t n,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                   cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHdot(const size_t n,
+                                  cl_mem dot_buffer, const size_t dot_offset,
+                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                  cl_command_queue* queue, cl_event* event);
 
 // Dot product of two complex vectors: CDOTU/ZDOTU
 StatusCode PUBLIC_API CLBlastCdotu(const size_t n,
@@ -260,7 +282,7 @@ StatusCode PUBLIC_API CLBlastZdotc(const size_t n,
                                    const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
 
-// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
+// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
 StatusCode PUBLIC_API CLBlastSnrm2(const size_t n,
                                    cl_mem nrm2_buffer, const size_t nrm2_offset,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -277,8 +299,12 @@ StatusCode PUBLIC_API CLBlastDznrm2(const size_t n,
                                    cl_mem nrm2_buffer, const size_t nrm2_offset,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHnrm2(const size_t n,
+                                   cl_mem nrm2_buffer, const size_t nrm2_offset,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
+// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
 StatusCode PUBLIC_API CLBlastSasum(const size_t n,
                                    cl_mem asum_buffer, const size_t asum_offset,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -295,8 +321,12 @@ StatusCode PUBLIC_API CLBlastDzasum(const size_t n,
                                    cl_mem asum_buffer, const size_t asum_offset,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHasum(const size_t n,
+                                   cl_mem asum_buffer, const size_t asum_offset,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
+// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
 StatusCode PUBLIC_API CLBlastSsum(const size_t n,
                                   cl_mem sum_buffer, const size_t sum_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -313,8 +343,12 @@ StatusCode PUBLIC_API CLBlastDzsum(const size_t n,
                                   cl_mem sum_buffer, const size_t sum_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsum(const size_t n,
+                                  cl_mem sum_buffer, const size_t sum_offset,
+                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  cl_command_queue* queue, cl_event* event);
 
-// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
+// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
 StatusCode PUBLIC_API CLBlastiSamax(const size_t n,
                                    cl_mem imax_buffer, const size_t imax_offset,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -331,8 +365,12 @@ StatusCode PUBLIC_API CLBlastiZamax(const size_t n,
                                    cl_mem imax_buffer, const size_t imax_offset,
                                    const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastiHamax(const size_t n,
+                                   cl_mem imax_buffer, const size_t imax_offset,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
+// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
 StatusCode PUBLIC_API CLBlastiSmax(const size_t n,
                                   cl_mem imax_buffer, const size_t imax_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -349,8 +387,12 @@ StatusCode PUBLIC_API CLBlastiZmax(const size_t n,
                                   cl_mem imax_buffer, const size_t imax_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastiHmax(const size_t n,
+                                  cl_mem imax_buffer, const size_t imax_offset,
+                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  cl_command_queue* queue, cl_event* event);
 
-// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
+// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
 StatusCode PUBLIC_API CLBlastiSmin(const size_t n,
                                   cl_mem imin_buffer, const size_t imin_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -367,12 +409,16 @@ StatusCode PUBLIC_API CLBlastiZmin(const size_t n,
                                   cl_mem imin_buffer, const size_t imin_offset,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastiHmin(const size_t n,
+                                  cl_mem imin_buffer, const size_t imin_offset,
+                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  cl_command_queue* queue, cl_event* event);
 
 // =================================================================================================
 // BLAS level-2 (matrix-vector) routines
 // =================================================================================================
 
-// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
+// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
 StatusCode PUBLIC_API CLBlastSgemv(const Layout layout, const Transpose a_transpose,
                                    const size_t m, const size_t n,
                                    const float alpha,
@@ -405,8 +451,16 @@ StatusCode PUBLIC_API CLBlastZgemv(const Layout layout, const Transpose a_transp
                                    const cl_double2 beta,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHgemv(const Layout layout, const Transpose a_transpose,
+                                   const size_t m, const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   const cl_half beta,
+                                   cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
+// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
 StatusCode PUBLIC_API CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
                                    const size_t m, const size_t n, const size_t kl, const size_t ku,
                                    const float alpha,
@@ -439,6 +493,14 @@ StatusCode PUBLIC_API CLBlastZgbmv(const Layout layout, const Transpose a_transp
                                    const cl_double2 beta,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
+                                   const size_t m, const size_t n, const size_t kl, const size_t ku,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   const cl_half beta,
+                                   cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
 // Hermitian matrix-vector multiplication: CHEMV/ZHEMV
 StatusCode PUBLIC_API CLBlastChemv(const Layout layout, const Triangle triangle,
@@ -494,7 +556,7 @@ StatusCode PUBLIC_API CLBlastZhpmv(const Layout layout, const Triangle triangle,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
 
-// Symmetric matrix-vector multiplication: SSYMV/DSYMV
+// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
 StatusCode PUBLIC_API CLBlastSsymv(const Layout layout, const Triangle triangle,
                                    const size_t n,
                                    const float alpha,
@@ -511,8 +573,16 @@ StatusCode PUBLIC_API CLBlastDsymv(const Layout layout, const Triangle triangle,
                                    const double beta,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsymv(const Layout layout, const Triangle triangle,
+                                   const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   const cl_half beta,
+                                   cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
+// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
 StatusCode PUBLIC_API CLBlastSsbmv(const Layout layout, const Triangle triangle,
                                    const size_t n, const size_t k,
                                    const float alpha,
@@ -529,8 +599,16 @@ StatusCode PUBLIC_API CLBlastDsbmv(const Layout layout, const Triangle triangle,
                                    const double beta,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsbmv(const Layout layout, const Triangle triangle,
+                                   const size_t n, const size_t k,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   const cl_half beta,
+                                   cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
+// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
 StatusCode PUBLIC_API CLBlastSspmv(const Layout layout, const Triangle triangle,
                                    const size_t n,
                                    const float alpha,
@@ -547,8 +625,16 @@ StatusCode PUBLIC_API CLBlastDspmv(const Layout layout, const Triangle triangle,
                                    const double beta,
                                    cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHspmv(const Layout layout, const Triangle triangle,
+                                   const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem ap_buffer, const size_t ap_offset,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   const cl_half beta,
+                                   cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
+// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
 StatusCode PUBLIC_API CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                                    const size_t n,
                                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@@ -569,8 +655,13 @@ StatusCode PUBLIC_API CLBlastZtrmv(const Layout layout, const Triangle triangle,
                                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                    cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                                   const size_t n,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
+// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
 StatusCode PUBLIC_API CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                                    const size_t n, const size_t k,
                                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
@@ -591,8 +682,13 @@ StatusCode PUBLIC_API CLBlastZtbmv(const Layout layout, const Triangle triangle,
                                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                    cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                                   const size_t n, const size_t k,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
+// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
 StatusCode PUBLIC_API CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                                    const size_t n,
                                    const cl_mem ap_buffer, const size_t ap_offset,
@@ -613,6 +709,11 @@ StatusCode PUBLIC_API CLBlastZtpmv(const Layout layout, const Triangle triangle,
                                    const cl_mem ap_buffer, const size_t ap_offset,
                                    cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                                   const size_t n,
+                                   const cl_mem ap_buffer, const size_t ap_offset,
+                                   cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   cl_command_queue* queue, cl_event* event);
 
 // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
 StatusCode PUBLIC_API CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -680,7 +781,7 @@ StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle,
                                    cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                    cl_command_queue* queue, cl_event* event);
 
-// General rank-1 matrix update: SGER/DGER
+// General rank-1 matrix update: SGER/DGER/HGER
 StatusCode PUBLIC_API CLBlastSger(const Layout layout,
                                   const size_t m, const size_t n,
                                   const float alpha,
@@ -695,6 +796,13 @@ StatusCode PUBLIC_API CLBlastDger(const Layout layout,
                                   const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                   cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                   cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHger(const Layout layout,
+                                  const size_t m, const size_t n,
+                                  const cl_half alpha,
+                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                  cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                  cl_command_queue* queue, cl_event* event);
 
 // General rank-1 complex matrix update: CGERU/ZGERU
 StatusCode PUBLIC_API CLBlastCgeru(const Layout layout,
@@ -788,7 +896,7 @@ StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle,
                                    cl_mem ap_buffer, const size_t ap_offset,
                                    cl_command_queue* queue, cl_event* event);
 
-// Symmetric rank-1 matrix update: SSYR/DSYR
+// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
 StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle,
                                   const size_t n,
                                   const float alpha,
@@ -801,8 +909,14 @@ StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                   cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsyr(const Layout layout, const Triangle triangle,
+                                  const size_t n,
+                                  const cl_half alpha,
+                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                  cl_command_queue* queue, cl_event* event);
 
-// Symmetric packed rank-1 matrix update: SSPR/DSPR
+// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
 StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle,
                                   const size_t n,
                                   const float alpha,
@@ -815,8 +929,14 @@ StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle,
                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_mem ap_buffer, const size_t ap_offset,
                                   cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHspr(const Layout layout, const Triangle triangle,
+                                  const size_t n,
+                                  const cl_half alpha,
+                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  cl_mem ap_buffer, const size_t ap_offset,
+                                  cl_command_queue* queue, cl_event* event);
 
-// Symmetric rank-2 matrix update: SSYR2/DSYR2
+// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
 StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle,
                                    const size_t n,
                                    const float alpha,
@@ -831,8 +951,15 @@ StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle,
                                    const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsyr2(const Layout layout, const Triangle triangle,
+                                   const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
 StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle,
                                    const size_t n,
                                    const float alpha,
@@ -847,12 +974,19 @@ StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle,
                                    const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                                    cl_mem ap_buffer, const size_t ap_offset,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHspr2(const Layout layout, const Triangle triangle,
+                                   const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                   const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                   cl_mem ap_buffer, const size_t ap_offset,
+                                   cl_command_queue* queue, cl_event* event);
 
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
 // =================================================================================================
 
-// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
+// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
 StatusCode PUBLIC_API CLBlastSgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                                    const size_t m, const size_t n, const size_t k,
                                    const float alpha,
@@ -885,8 +1019,16 @@ StatusCode PUBLIC_API CLBlastZgemm(const Layout layout, const Transpose a_transp
                                    const cl_double2 beta,
                                    cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                                   const size_t m, const size_t n, const size_t k,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   const cl_half beta,
+                                   cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
+// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
 StatusCode PUBLIC_API CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
                                    const size_t m, const size_t n,
                                    const float alpha,
@@ -919,6 +1061,14 @@ StatusCode PUBLIC_API CLBlastZsymm(const Layout layout, const Side side, const T
                                    const cl_double2 beta,
                                    cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
+                                   const size_t m, const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   const cl_half beta,
+                                   cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                   cl_command_queue* queue, cl_event* event);
 
 // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
 StatusCode PUBLIC_API CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
@@ -938,7 +1088,7 @@ StatusCode PUBLIC_API CLBlastZhemm(const Layout layout, const Side side, const T
                                    cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                    cl_command_queue* queue, cl_event* event);
 
-// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
+// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
 StatusCode PUBLIC_API CLBlastSsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                                    const size_t n, const size_t k,
                                    const float alpha,
@@ -967,6 +1117,13 @@ StatusCode PUBLIC_API CLBlastZsyrk(const Layout layout, const Triangle triangle,
                                    const cl_double2 beta,
                                    cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                                   const size_t n, const size_t k,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   const cl_half beta,
+                                   cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                   cl_command_queue* queue, cl_event* event);
 
 // Rank-K update of a hermitian matrix: CHERK/ZHERK
 StatusCode PUBLIC_API CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
@@ -984,7 +1141,7 @@ StatusCode PUBLIC_API CLBlastZherk(const Layout layout, const Triangle triangle,
                                    cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                    cl_command_queue* queue, cl_event* event);
 
-// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
 StatusCode PUBLIC_API CLBlastSsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                                     const size_t n, const size_t k,
                                     const float alpha,
@@ -1017,6 +1174,14 @@ StatusCode PUBLIC_API CLBlastZsyr2k(const Layout layout, const Triangle triangle
                                     const cl_double2 beta,
                                     cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                     cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                                    const size_t n, const size_t k,
+                                    const cl_half alpha,
+                                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                    const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                    const cl_half beta,
+                                    cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                    cl_command_queue* queue, cl_event* event);
 
 // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
 StatusCode PUBLIC_API CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
@@ -1036,7 +1201,7 @@ StatusCode PUBLIC_API CLBlastZher2k(const Layout layout, const Triangle triangle
                                     cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
                                     cl_command_queue* queue, cl_event* event);
 
-// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
+// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
 StatusCode PUBLIC_API CLBlastStrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                                    const size_t m, const size_t n,
                                    const float alpha,
@@ -1061,8 +1226,14 @@ StatusCode PUBLIC_API CLBlastZtrmm(const Layout layout, const Side side, const T
                                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                                   const size_t m, const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   cl_command_queue* queue, cl_event* event);
 
-// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
 StatusCode PUBLIC_API CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                                    const size_t m, const size_t n,
                                    const float alpha,
@@ -1087,6 +1258,48 @@ StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const T
                                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
                                    cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                                   const size_t m, const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+StatusCode PUBLIC_API CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const float alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const double alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const cl_float2 alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const cl_double2 alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const cl_half alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
 
 // =================================================================================================
 
diff --git a/include/clblast_half.h b/include/clblast_half.h
new file mode 100644
index 00000000..269a520e
--- /dev/null
+++ b/include/clblast_half.h
@@ -0,0 +1,256 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides simple conversion operations between fp16 (half) and fp32 (float). These
+// conversion functions are based on ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf and
+// are also part of the C++ half-precision header (http://half.sourceforge.net/).
+//
+// This file is pure C99.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_HALF_H_
+#define CLBLAST_HALF_H_
+
+// Includes the normal OpenCL C header
+#if defined(__APPLE__) || defined(__MACOSX)
+  #include <OpenCL/opencl.h>
+#else
+  #include <CL/opencl.h>
+#endif
+
+// =================================================================================================
+
+// Host data-type for half-precision floating-point (16-bit). This is based on the OpenCL type,
+// which is a typedef for unsigned short.
+typedef cl_half half;
+
+// 32-bit union overlaying a float and an unsigned integer, used to access raw floating-point bits
+typedef union ConversionBits_ {
+  unsigned int i32;  // integer view of the bits; NOTE(review): assumes unsigned int is 32 bits wide
+  float f32;  // floating-point view of the same storage
+} ConversionBits;
+
+// =================================================================================================
+
+// Converts an IEEE-compliant single-precision value to half-precision floating-point by truncating
+// the mantissa (round toward zero); overflows become infinity. Declared 'static inline' so that
+// each C99 translation unit gets its own definition; plain 'inline' emits no external symbol.
+static inline half FloatToHalf(const float value) {
+  static const unsigned short base_table[512] = {  // half-precision base bits, indexed by (bits.i32 >> 23)
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100,
+    0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00,
+    0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00,
+    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+    0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+    0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100,
+    0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00,
+    0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00,
+    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00,
+    0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00
+  };
+  static const unsigned char shift_table[512] = {  // right-shift for the 23-bit mantissa field, same index
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13
+  };
+  ConversionBits bits;
+  bits.f32 = value;  // view the float's bit pattern as an unsigned integer through the union
+  const unsigned short halfbits = base_table[bits.i32 >> 23] +
+                                  (unsigned short)((bits.i32 & 0x7FFFFF) >> shift_table[bits.i32 >> 23]);
+  return halfbits;
+}
+
+// Converts a half-precision value to IEEE-compliant single-precision floating-point
+inline float HalfToFloat(const half value) {
+  static const unsigned int mantissa_table[2048] = { 
+    0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000,
+    0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000,
+    0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000,
+    0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000,
+    0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000,
+    0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000,
+    0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000,
+    0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000,
+    0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000,
+    0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000,
+    0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000,
+    0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000,
+    0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000,
+    0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000,
+    0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000,
+    0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000,
+    0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000,
+    0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000,
+    0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000,
+    0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000,
+    0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000,
+    0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000,
+    0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000,
+    0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000,
+    0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000,
+    0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000,
+    0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000,
+    0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000,
+    0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000,
+    0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000,
+    0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000,
+    0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000,
+    0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000,
+    0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000,
+    0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000,
+    0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000,
+    0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000,
+    0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000,
+    0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000,
+    0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000,
+    0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000,
+    0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000,
+    0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000,
+    0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000,
+    0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000,
+    0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000,
+    0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000,
+    0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000,
+    0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000,
+    0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000,
+    0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000,
+    0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000,
+    0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000,
+    0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000,
+    0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000,
+    0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000,
+    0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000,
+    0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000,
+    0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000,
+    0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000,
+    0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000,
+    0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000,
+    0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000,
+    0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000,
+    0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000,
+    0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000,
+    0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000,
+    0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000,
+    0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000,
+    0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000,
+    0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000,
+    0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000,
+    0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000,
+    0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000,
+    0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000,
+    0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000,
+    0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000,
+    0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000,
+    0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000,
+    0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000,
+    0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000,
+    0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000,
+    0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000,
+    0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000,
+    0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000,
+    0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000,
+    0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000,
+    0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000,
+    0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000,
+    0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000,
+    0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000,
+    0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000,
+    0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000,
+    0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000,
+    0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000,
+    0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000,
+    0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000,
+    0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000,
+    0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000,
+    0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000,
+    0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000,
+    0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000,
+    0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000,
+    0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000,
+    0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000,
+    0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000,
+    0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000,
+    0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000,
+    0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000,
+    0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000,
+    0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000,
+    0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000,
+    0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000,
+    0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000,
+    0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000,
+    0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000,
+    0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000,
+    0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000,
+    0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000,
+    0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000,
+    0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000,
+    0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000,
+    0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000,
+    0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000,
+    0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000,
+    0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000,
+    0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000,
+    0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000
+  };
+  static const unsigned int exponent_table[64] = { 
+    0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000,
+    0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000,
+    0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
+    0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000
+  };
+  static const unsigned short offset_table[64] = { 
+    0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024,
+    0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024
+  };
+  ConversionBits bits;
+  bits.i32 = mantissa_table[offset_table[value >> 10] + (value & 0x3FF)] +
+             exponent_table[value >> 10];
+  return bits.f32;
+}
+
+// =================================================================================================
+
+// CLBLAST_HALF_H_
+#endif
diff --git a/include/internal/routine.h b/include/internal/routine.h
deleted file mode 100644
index 6df186c5..00000000
--- a/include/internal/routine.h
+++ /dev/null
@@ -1,143 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements all the basic functionality for the BLAS routines. This class serves as a
-// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
-// compiling the OpenCL kernel, connecting to the database, etc.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_ROUTINE_H_
-#define CLBLAST_ROUTINE_H_
-
-#include <string>
-#include <vector>
-
-#include "internal/cache.h"
-#include "internal/utilities.h"
-#include "internal/database.h"
-
-namespace clblast {
-// =================================================================================================
-
-// See comment at top of file for a description of the class
-template <typename T>
-class Routine {
- public:
-
-  // Helper functions which check for errors in the status code
-  static constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
-
-  // Base class constructor
-  explicit Routine(Queue &queue, EventPointer event, const std::string &name,
-                   const std::vector<std::string> &routines, const Precision precision);
-
-  // Set-up phase of the kernel
-  StatusCode SetUp();
-
- protected:
-  
-  // Runs a kernel given the global and local thread sizes
-  StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
-                       const std::vector<size_t> &local, EventPointer event,
-                       std::vector<Event>& waitForEvents);
-
-  // As above, but without an event waiting list
-  StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global,
-                       const std::vector<size_t> &local, EventPointer event);
-
-  // Tests for valid inputs of matrices A, B, and C
-  StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
-                         const size_t offset, const size_t ld, const size_t data_size);
-  StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
-                         const size_t offset, const size_t ld, const size_t data_size);
-  StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
-                         const size_t offset, const size_t ld, const size_t data_size);
-  StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer,
-                          const size_t offset, const size_t data_size);
-
-  // Tests for valid inputs of vector X and Y
-  StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                         const size_t inc, const size_t data_size);
-  StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                         const size_t inc, const size_t data_size);
-
-  // Tests for valid inputs of other vectors
-  StatusCode TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                           const size_t data_size);
-  StatusCode TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
-                             const size_t offset, const size_t data_size);
-
-  // Copies/transposes a matrix and padds/unpads it with zeroes. This method is also able to write
-  // to symmetric and triangular matrices through optional arguments.
-  StatusCode PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
-                                    const size_t src_one, const size_t src_two,
-                                    const size_t src_ld, const size_t src_offset,
-                                    const Buffer<T> &src,
-                                    const size_t dest_one, const size_t dest_two,
-                                    const size_t dest_ld, const size_t dest_offset,
-                                    const Buffer<T> &dest,
-                                    const Program &program, const bool do_pad,
-                                    const bool do_transpose, const bool do_conjugate,
-                                    const bool upper = false, const bool lower = false,
-                                    const bool diagonal_imag_zero = false);
-
-  // Stores a newly compiled binary/program into the cache
-  void StoreBinaryToCache(const std::string& binary) const {
-    cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
-  }
-  void StoreProgramToCache(const Program& program) const {
-    cache::StoreProgramToCache(program, context_, precision_, routine_name_);
-  }
-
-  // Queries the cache and retrieve either a matching binary/program or a boolean whether a match
-  // exists. The first assumes that the binary/program is available in the cache and will throw an
-  // exception otherwise.
-  std::string GetBinaryFromCache() const {
-    return cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
-  }
-  Program GetProgramFromCache() const {
-    return cache::GetProgramFromCache(context_, precision_, routine_name_);
-  }
-  bool BinaryIsInCache() const {
-    return cache::BinaryIsInCache(device_name_, precision_, routine_name_);
-  }
-  bool ProgramIsInCache() const {
-    return cache::ProgramIsInCache(context_, precision_, routine_name_);
-  }
-
-  // Non-static variable for the precision. Note that the same variable (but static) might exist in
-  // a derived class.
-  const Precision precision_;
-
-  // The routine's name and its kernel-source in string form
-  const std::string routine_name_;
-  std::string source_string_;
-
-  // The OpenCL objects, accessible only from derived classes
-  Queue queue_;
-  EventPointer event_;
-  const Context context_;
-  const Device device_;
-
-  // OpenCL device properties
-  const std::string device_name_;
-  const size_t max_work_item_dimensions_;
-  const std::vector<size_t> max_work_item_sizes_;
-  const size_t max_work_group_size_;
-
-  // Connection to the database for all the device-specific parameters
-  const Database db_;
-};
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_ROUTINE_H_
-#endif
diff --git a/samples/haxpy.c b/samples/haxpy.c
new file mode 100644
index 00000000..3c7bb33a
--- /dev/null
+++ b/samples/haxpy.c
@@ -0,0 +1,105 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the HAXPY routine. It demonstrates the use of half-precision.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// Includes the float-to-half and half-to-float conversion utilities
+#include <clblast_half.h>
+
+// =================================================================================================
+
+// Example use of the half-precision routine HAXPY
+int main(void) {
+  // Computes b = alpha * a + b in half-precision on the platform/device selected below
+  // OpenCL platform/device settings
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example HAXPY arguments
+  const size_t n = 8192;
+  const cl_half alpha = FloatToHalf(0.5f);
+
+  // Initializes the OpenCL platform
+  cl_uint num_platforms;
+  clGetPlatformIDs(0, NULL, &num_platforms);
+  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_platform_id platform = platforms[platform_id];
+
+  // Initializes the OpenCL device
+  cl_uint num_devices;
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_device_id device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+  cl_event event = NULL;
+
+  // Populate host vectors with some example data
+  cl_half* host_a = (cl_half*)malloc(sizeof(cl_half)*n);
+  cl_half* host_b = (cl_half*)malloc(sizeof(cl_half)*n);
+  for (size_t i=0; i<n; ++i) { host_a[i] = FloatToHalf(2.2f); }
+  for (size_t i=0; i<n; ++i) { host_b[i] = FloatToHalf(0.4f); }
+  printf("Input values at index 0: alpha * a[0] + b[0] == %.3lf * %.3lf + %.3lf\n",
+         HalfToFloat(alpha), HalfToFloat(host_a[0]), HalfToFloat(host_b[0]));
+
+  // Copy the vectors to the device
+  cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL);
+  cl_mem device_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, n*sizeof(cl_half), host_a, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
+
+  // Call the HAXPY routine.
+  StatusCode status = CLBlastHaxpy(n, alpha,
+                                   device_a, 0, 1,
+                                   device_b, 0, 1,
+                                   &queue, &event);
+
+  // Wait for completion and release the event (skipped on failure: the event is then still NULL)
+  if (status == 0) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+
+  // Copies the result back to the host
+  clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
+
+  // Example completed. See "clblast_c.h" for status codes (0 -> success).
+  printf("Completed HAXPY with status %d\n", status);
+
+  // Prints the first output value
+  if (status == 0) {
+    printf("Output value at index 0: b[0] = %.3lf\n", HalfToFloat(host_b[0]));
+  }
+
+  // Clean-up
+  free(platforms);
+  free(devices);
+  free(host_a);
+  free(host_b);
+  clReleaseMemObject(device_a);
+  clReleaseMemObject(device_b);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sgemm.cc b/samples/sgemm.cpp
index 5fe7490a..5fe7490a 100644
--- a/samples/sgemm.cc
+++ b/samples/sgemm.cpp
diff --git a/scripts/database/database.py b/scripts/database/database.py
index 8e8f37f8..49bc1801 100644
--- a/scripts/database/database.py
+++ b/scripts/database/database.py
@@ -143,7 +143,8 @@ def CalculateDefaults(df):
 	groups = dfdefault.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"])
 	for name, dfgroup in groups:
 		if len(dfgroup) != 1:
-			print("[WARNING] Entries for a single kernel with multiple argument values")
+			description = dfgroup["kernel"].min() + " " + dfgroup["device_vendor"].min()
+			print("[WARNING] Entries for a single kernel with multiple argument values: " + description)
 			
 	# Defaults in general
 	groups = df.groupby(KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"])
@@ -189,13 +190,20 @@ def GetFooter():
 
 # The start of a new C++ precision entry
 def GetPrecision(family, precision):
-	precisionstring = "Single"
-	if precision == "64":
+	precisionstring = ""
+	if precision == "16":
+		precisionstring = "Half"
+	elif precision == "32":
+		precisionstring = "Single"
+	elif precision == "64":
 		precisionstring = "Double"
 	elif precision == "3232":
 		precisionstring = "ComplexSingle"
 	elif precision == "6464":
 		precisionstring = "ComplexDouble"
+	else:
+		print("[ERROR] Unknown precision: " + str(precision))
+		sys.exit(1)  # Non-zero exit status, so that calling scripts can detect the failure
 	return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n  \"%s\", Precision::k%s, {\n"
 	       % (family.title(), precisionstring, family.title(), precisionstring))
 
@@ -211,7 +219,7 @@ def PrintData(df, outputdir):
 	# Iterates over the kernel families: creates a new file per family
 	for family, dffamily in df.groupby(["kernel_family"]):
 		dffamily = dffamily.dropna(axis=1, how='all')
-		f = open(os.path.join(outputdir, family+'.h'), 'w+')
+		f = open(os.path.join(outputdir, family+'.hpp'), 'w+')
 		f.write(GetHeader(family))
 
 		# Loops over the different entries for this family and prints their headers
@@ -294,6 +302,11 @@ if len(glob.glob(glob_json)) >= 1:
 	print("## Storing the database to disk...")
 	SaveDatabase(database, file_db)
 
+# Optional: update the database here. Default is disabled, code below is just an example
+if False:
+	database = UpdateDatabase(database, ((database["kernel"] == "CopyMatrixFast") & (database["precision"] == "3232")), "arg_alpha", "2+0.5i")
+	SaveDatabase(database, file_db)
+
 # Retrieves the best performing results
 print("## Calculating the best results per device/kernel...")
 bests = GetBestResults(database)
@@ -303,7 +316,7 @@ defaults = CalculateDefaults(bests)
 bests = ConcatenateData(bests, defaults)
 
 # Outputs the data as a C++ database
-path_cpp_database = os.path.join(path_clblast, "include", "internal", "database")
+path_cpp_database = os.path.join(path_clblast, "src", "database", "kernels")
 print("## Producing a C++ database in '"+path_cpp_database+"'...")
 PrintData(bests, path_cpp_database)
 
diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py
index 5a58ab53..5bff95d1 100644
--- a/scripts/generator/datatype.py
+++ b/scripts/generator/datatype.py
@@ -13,10 +13,13 @@
 # ==================================================================================================
 
 # Short-hands for data-types
+HLF = "half"
 FLT = "float"
 DBL = "double"
 FLT2 = "float2"
 DBL2 = "double2"
+
+HCL = "cl_half"
 F2CL = "cl_float2"
 D2CL = "cl_double2"
 
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 210f371f..cf01f79e 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -10,14 +10,14 @@
 # This script automatically generates the bodies of the following files, creating the full CLBlast
 # API interface and implementation (C, C++, and reference BLAS wrappers):
 #    clblast.h
-#    clblast.cc
+#    clblast.cpp
 #    clblast_c.h
-#    clblast_c.cc
+#    clblast_c.cpp
 #    wrapper_clblas.h
 #    wrapper_cblas.h
 # It also generates the main functions for the correctness and performance tests as found in
-#    test/correctness/routines/levelX/xYYYY.cc
-#    test/performance/routines/levelX/xYYYY.cc
+#    test/correctness/routines/levelX/xYYYY.cpp
+#    test/performance/routines/levelX/xYYYY.cpp
 # It also produces the API documentation found in doc/clblast.md
 #
 # ==================================================================================================
@@ -28,11 +28,12 @@ import os.path
 
 # Local files
 from routine import Routine
-from datatype import DataType, FLT, DBL, FLT2, DBL2, F2CL, D2CL
+from datatype import DataType, HLF, FLT, DBL, FLT2, DBL2, HCL, F2CL, D2CL
 
 # ==================================================================================================
 
 # Regular data-types
+H = DataType("H", "H", HLF,  [HLF,  HLF,  HCL,  HCL],  HLF ) # half (16)
 S = DataType("S", "S", FLT,  [FLT,  FLT,  FLT,  FLT],  FLT ) # single (32)
 D = DataType("D", "D", DBL,  [DBL,  DBL,  DBL,  DBL],  DBL ) # double (64)
 C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232)
@@ -41,6 +42,7 @@ Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6
 # Special cases
 Sc = DataType("C", "Sc", FLT2,         [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output
 Dz = DataType("Z", "Dz", DBL2,         [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output
+iH = DataType("H", "iH", HLF,          [HLF,  HLF,  HLF,  HLF],  HLF ) # As H, but with integer output
 iS = DataType("S", "iS", FLT,          [FLT,  FLT,  FLT,  FLT],  FLT ) # As S, but with integer output
 iD = DataType("D", "iD", DBL,          [DBL,  DBL,  DBL,  DBL],  DBL ) # As D, but with integer output
 iC = DataType("C", "iC", FLT2,         [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output
@@ -57,65 +59,85 @@ TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T")
 
 # ==================================================================================================
 
+# Different possibilities for requirements
+ald_m = "The value of `a_ld` must be at least `m`."
+ald_n = "The value of `a_ld` must be at least `n`."
+ald_k_one = "The value of `a_ld` must be at least `k + 1`."
+ald_kl_ku_one = "The value of `a_ld` must be at least `kl + ku + 1`."
+ald_transa_m_k = "When `transpose_a == Transpose::kNo`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`."
+ald_trans_n_k = "When `transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`."
+ald_side_m_n = "When `side == Side::kLeft` then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`."
+bld_m = "The value of `b_ld` must be at least `m`."
+bld_n = "The value of `b_ld` must be at least `n`."
+bld_transb_k_n = "When `transpose_b == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`."
+bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`."
+cld_m = "The value of `c_ld` must be at least `m`."
+cld_n = "The value of `c_ld` must be at least `n`."
+
+# ==================================================================================================
+
 # Populates a list of routines
 routines = [
 [ # Level 1: vector-vector
-  Routine(False, True,  "1", "rotg",  T,  [S,D],     [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
-  Routine(False, True,  "1", "rotmg", T,  [S,D],     [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
-  Routine(False, True,  "1", "rot",   T,  [S,D],     ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
-  Routine(False, True,  "1", "rotm",  T,  [S,D],     ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
-  Routine(True,  True,  "1", "swap",  T,  [S,D,C,Z], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges the contents of vectors x and y.", []),
-  Routine(True,  True,  "1", "scal",  T,  [S,D,C,Z], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies all elements of vector x by a scalar constant alpha.", []),
-  Routine(True,  True,  "1", "copy",  T,  [S,D,C,Z], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector x into vector y.", []),
-  Routine(True,  True,  "1", "axpy",  T,  [S,D,C,Z], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation y = alpha * x + y, in which x and y are vectors and alpha is a scalar constant.", []),
-  Routine(True,  True,  "1", "dot",   T,  [S,D],     ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies the vectors x and y element-wise and accumulates the results. The sum is stored in the dot buffer.", []),
-  Routine(True,  True,  "1", "dotu",  T,  [C,Z],     ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
-  Routine(True,  True,  "1", "dotc",  T,  [C,Z],     ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
-  Routine(True,  True,  "1", "nrm2",  T, [S,D,Sc,Dz],["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of each element in the x vector and takes the square root. The resulting L2 norm is stored in the nrm2 buffer.", []),
-  Routine(True,  True,  "1", "asum",  T, [S,D,Sc,Dz],["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of each element in the x vector. The results are stored in the asum buffer.", []),
-  Routine(True,  False, "1", "sum",   T, [S,D,Sc,Dz],["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of each element in the x vector. The results are stored in the sum buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
-  Routine(True,  True,  "1", "amax",  T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the x vector. The resulting integer index is stored in the imax buffer.", []),
-  Routine(True,  False, "1", "max",   T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the x vector. The resulting integer index is stored in the imax buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
-  Routine(True,  False, "1", "min",   T, [iS,iD,iC,iZ],["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the x vector. The resulting integer index is stored in the imin buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
+  Routine(False, True,  "1", "rotg",  T, [S,D],            [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []),
+  Routine(False, True,  "1", "rotmg", T, [S,D],            [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []),
+  Routine(False, True,  "1", "rot",   T, [S,D],            ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []),
+  Routine(False, True,  "1", "rotm",  T, [S,D],            ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []),
+  Routine(True,  True,  "1", "swap",  T, [S,D,C,Z,H],      ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []),
+  Routine(True,  True,  "1", "scal",  T, [S,D,C,Z,H],      ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []),
+  Routine(True,  True,  "1", "copy",  T, [S,D,C,Z,H],      ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []),
+  Routine(True,  True,  "1", "axpy",  T, [S,D,C,Z,H],      ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []),
+  Routine(True,  True,  "1", "dot",   T, [S,D,H],          ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []),
+  Routine(True,  True,  "1", "dotu",  T, [C,Z],            ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []),
+  Routine(True,  True,  "1", "dotc",  T, [C,Z],            ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []),
+  Routine(True,  True,  "1", "nrm2",  T, [S,D,Sc,Dz,H],    ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidean norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
+  Routine(True,  True,  "1", "asum",  T, [S,D,Sc,Dz,H],    ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
+  Routine(True,  False, "1", "sum",   T, [S,D,Sc,Dz,H],    ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
+  Routine(True,  True,  "1", "amax",  T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
+  Routine(True,  False, "1", "max",   T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
+  Routine(True,  False, "1", "min",   T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
 ],
 [ # Level 2: matrix-vector
-  Routine(True,  True,  "2a", "gemv",  T,  [S,D,C,Z], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation y = alpha * A * x + beta * y, in which x is an input vector, y is an input and output vector, A is an input matrix, and alpha and beta are scalars. The matrix A can optionally be transposed before performing the operation.", []),
-  Routine(True,  True,  "2a", "gbmv",  T,  [S,D,C,Z], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is banded instead.", []),
-  Routine(True,  True,  "2a", "hemv",  T,  [C,Z],     ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian matrix instead.", []),
-  Routine(True,  True,  "2a", "hbmv",  T,  [C,Z],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian banded matrix instead.", []),
-  Routine(True,  True,  "2a", "hpmv",  T,  [C,Z],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is an Hermitian packed matrix instead and represented as AP.", []),
-  Routine(True,  True,  "2a", "symv",  T,  [S,D],     ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric instead.", []),
-  Routine(True,  True,  "2a", "sbmv",  T,  [S,D],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is symmetric and banded instead.", []),
-  Routine(True,  True,  "2a", "spmv",  T,  [S,D],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a symmetric packed matrix instead and represented as AP.", []),
-  Routine(True,  True,  "2a", "trmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular instead.", []),
-  Routine(True,  True,  "2a", "tbmv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix A is triangular and banded instead.", []),
-  Routine(True,  True,  "2a", "tpmv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix A is a triangular packed matrix instead and repreented as AP.", []),
-  Routine(False, True,  "2a", "trsv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
-  Routine(False, True,  "2a", "tbsv",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
-  Routine(False, True,  "2a", "tpsv",  T,  [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
+  Routine(True,  True,  "2a", "gemv",  T,  [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
+  Routine(True,  True,  "2a", "gbmv",  T,  [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]),
+  Routine(True,  True,  "2a", "hemv",  T,  [C,Z],       ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]),
+  Routine(True,  True,  "2a", "hbmv",  T,  [C,Z],       ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]),
+  Routine(True,  True,  "2a", "hpmv",  T,  [C,Z],       ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2a", "symv",  T,  [S,D,H],     ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]),
+  Routine(True,  True,  "2a", "sbmv",  T,  [S,D,H],     ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]),
+  Routine(True,  True,  "2a", "spmv",  T,  [S,D,H],     ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2a", "trmv",  T,  [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]),
+  Routine(True,  True,  "2a", "tbmv",  T,  [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]),
+  Routine(True,  True,  "2a", "tpmv",  T,  [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.", []),
+  Routine(False, True,  "2a", "trsv",  T,  [S,D,C,Z],   ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []),
+  Routine(False, True,  "2a", "tbsv",  T,  [S,D,C,Z],   ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]),
+  Routine(False, True,  "2a", "tpsv",  T,  [S,D,C,Z],   ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
   # Level 2: matrix update
-  Routine(True,  True,  "2b", "ger",   T,  [S,D],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
-  Routine(True,  True,  "2b", "geru",  T,  [C,Z],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
-  Routine(True,  True,  "2b", "gerc",  T,  [C,Z],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
-  Routine(True,  True,  "2b", "her",   Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
-  Routine(True,  True,  "2b", "hpr",   Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
-  Routine(True,  True,  "2b", "her2",  T,  [C,Z],     ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
-  Routine(True,  True,  "2b", "hpr2",  T,  [C,Z],     ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
-  Routine(True,  True,  "2b", "syr",   T,  [S,D],     ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
-  Routine(True,  True,  "2b", "spr",   T,  [S,D],     ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
-  Routine(True,  True,  "2b", "syr2",  T,  [S,D],     ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
-  Routine(True,  True,  "2b", "spr2",  T,  [S,D],     ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
+  Routine(True,  True,  "2b", "ger",   T,  [S,D,H],     ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]),
+  Routine(True,  True,  "2b", "geru",  T,  [C,Z],       ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]),
+  Routine(True,  True,  "2b", "gerc",  T,  [C,Z],       ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]),
+  Routine(True,  True,  "2b", "her",   Tc, [Css,Zdd],   ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which _x_ is an input vector, _x^T_ is the transpose of this vector, _A_ is the triangular Hermitian matrix to be updated, and _alpha_ is a scalar value.", [ald_n]),
+  Routine(True,  True,  "2b", "hpr",   Tc, [Css,Zdd],   ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2b", "her2",  T,  [C,Z],       ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermitian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]),
+  Routine(True,  True,  "2b", "hpr2",  T,  [C,Z],       ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2b", "syr",   T,  [S,D,H],     ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
+  Routine(True,  True,  "2b", "spr",   T,  [S,D,H],     ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSYR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
+  Routine(True,  True,  "2b", "syr2",  T,  [S,D,H],     ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]),
+  Routine(True,  True,  "2b", "spr2",  T,  [S,D,H],     ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSYR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []),
 ],
 [ # Level 3: matrix-matrix
-  Routine(True,  True,  "3", "gemm",  T,  [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
-  Routine(True,  True,  "3", "symm",  T,  [S,D,C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "", []),
-  Routine(True,  True,  "3", "hemm",  T,  [C,Z],     ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "", []),
-  Routine(True,  True,  "3", "syrk",  T,  [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "", []),
-  Routine(True,  True,  "3", "herk",  Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "", []),
-  Routine(True,  True,  "3", "syr2k", T,  [S,D,C,Z], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "", []),
-  Routine(True,  True,  "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "", []),
-  Routine(True,  True,  "3", "trmm",  T,  [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "", []),
-  Routine(False, True,  "3", "trsm",  T,  [S,D,C,Z], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
+  Routine(True,  True,  "3", "gemm",  T,  [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]),
+  Routine(True,  True,  "3", "symm",  T,  [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmetric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]),
+  Routine(True,  True,  "3", "hemm",  T,  [C,Z],       ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]),
+  Routine(True,  True,  "3", "syrk",  T,  [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_n]),
+  Routine(True,  True,  "3", "herk",  Tc, [Css,Zdd],   ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_n]),
+  Routine(True,  True,  "3", "syr2k", T,  [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+  Routine(True,  True,  "3", "her2k", TU, [Ccs,Zzd],   ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
+  Routine(True,  True,  "3", "trmm",  T,  [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
+  Routine(False, True,  "3", "trsm",  T,  [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
+],
+[ # Level X: extra routines (not part of BLAS)
+  Routine(True,  True,  "x", "omatcopy", T,  [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-of-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
 ]]
 
 # ==================================================================================================
@@ -130,6 +152,7 @@ def PrecisionToFullName(x):
 	}[x]
 
 # ==================================================================================================
+
 # Separators for the BLAS levels
 separators = ["""
 // =================================================================================================
@@ -142,8 +165,15 @@ separators = ["""
 """
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
+// =================================================================================================""",
+"""
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
 // ================================================================================================="""]
 
+# Names of the level sub-folders
+levelnames = ["1", "2", "3", "x"]
+
 # Main header/footer for source files
 header = """
 // =================================================================================================
@@ -170,7 +200,7 @@ def clblast_h(routines):
 		result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n"
 	return result
 
-# The C++ API implementation (.cc)
+# The C++ API implementation (.cpp)
 def clblast_cc(routines):
 	result = ""
 	for routine in routines:
@@ -207,7 +237,7 @@ def clblast_c_h(routines):
 			result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n"
 	return result
 
-# The C API implementation (.cc)
+# The C API implementation (.cpp)
 def clblast_c_cc(routines):
 	result = ""
 	for routine in routines:
@@ -229,21 +259,45 @@ def wrapper_clblas(routines):
 	result = ""
 	for routine in routines:
 		if routine.has_tests:
-			result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNames())
+			result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNamesTested())
 			if routine.NoScalars():
 				result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n"
 			for flavour in routine.flavours:
-				indent = " "*(17 + routine.Length())
 				result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n"
-				arguments = routine.ArgumentsWrapperCL(flavour)
-				if routine.scratch:
-					result += "  auto queue = Queue(queues[0]);\n"
-					result += "  auto context = queue.GetContext();\n"
-					result += "  auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
-					arguments += ["scratch_buffer()"]
-				result += "  return clblas"+flavour.name+routine.name+"("
-				result += (",\n"+indent).join([a for a in arguments])
-				result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
+
+				# There is a version available in clBLAS
+				if flavour.precision_name in ["S","D","C","Z"]:
+					indent = " "*(17 + routine.Length())
+					arguments = routine.ArgumentsWrapperCL(flavour)
+					if routine.scratch:
+						result += "  auto queue = Queue(queues[0]);\n"
+						result += "  auto context = queue.GetContext();\n"
+						result += "  auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n"
+						arguments += ["scratch_buffer()"]
+					result += "  return clblas"+flavour.name+routine.name+"("
+					result += (",\n"+indent).join([a for a in arguments])
+					result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
+
+				# There is no clBLAS available, forward the call to one of the available functions
+				else: # Half-precision
+					indent = " "*(24 + routine.Length())
+
+					# Convert to float (note: also integer buffers are stored as half/float)
+					for buf in routine.inputs + routine.outputs:
+						result += "  auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n"
+
+					# Call the float routine
+					result += "  auto status = clblasX"+routine.name+"("
+					result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()])
+					result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);"
+					result += "\n"
+
+					# Convert back to half
+					for buf in routine.outputs:
+						result += "  FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n"
+					result += "  return status;"
+
+				# Complete
 				result += "\n}\n"
 	return result
 
@@ -252,44 +306,66 @@ def wrapper_cblas(routines):
 	result = ""
 	for routine in routines:
 		if routine.has_tests:
-			result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNames())
+			result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNamesTested())
 			for flavour in routine.flavours:
-				indent = " "*(10 + routine.Length())
 				result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n"
-				arguments = routine.ArgumentsWrapperC(flavour)
-
-				# Double-precision scalars
-				for scalar in routine.scalars:
-					if flavour.IsComplex(scalar):
-						result += "  const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
-
-				# Special case for scalar outputs
-				assignment = ""
-				postfix = ""
-				endofline = ""
-				extra_argument = ""
-				for output_buffer in routine.outputs:
-					if output_buffer in routine.ScalarBuffersFirst():
-						if flavour in [C,Z]:
-							postfix += "_sub"
-							indent += "    "
-							extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
-						elif output_buffer in routine.IndexBuffers():
-							assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
-							indent += " "*len(assignment)
-						else:
-							assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
-							if (flavour.name in ["Sc","Dz"]):
-								assignment = assignment+".real("
-								endofline += ")"
-							else:
-								assignment = assignment+" = "
-							indent += " "*len(assignment)
 
-				result += "  "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
-				result += (",\n"+indent).join([a for a in arguments])
-				result += extra_argument+endofline+");"
-				result += "\n}\n"
+				# There is a version available in CBLAS
+				if flavour.precision_name in ["S","D","C","Z"]:
+					indent = " "*(10 + routine.Length())
+					arguments = routine.ArgumentsWrapperC(flavour)
+
+					# Complex scalars
+					for scalar in routine.scalars:
+						if flavour.IsComplex(scalar):
+							result += "  const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n"
+
+					# Special case for scalar outputs
+					assignment = ""
+					postfix = ""
+					endofline = ""
+					extra_argument = ""
+					for output_buffer in routine.outputs:
+						if output_buffer in routine.ScalarBuffersFirst():
+							if flavour in [C,Z]:
+								postfix += "_sub"
+								indent += "    "
+								extra_argument += ",\n"+indent+"reinterpret_cast<return_pointer_"+flavour.buffertype[:-1]+">(&"+output_buffer+"_buffer["+output_buffer+"_offset])"
+							elif output_buffer in routine.IndexBuffers():
+								assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = "
+								indent += " "*len(assignment)
+							else:
+								assignment = output_buffer+"_buffer["+output_buffer+"_offset]"
+								if (flavour.name in ["Sc","Dz"]):
+									assignment = assignment+".real("
+									endofline += ")"
+								else:
+									assignment = assignment+" = "
+								indent += " "*len(assignment)
+
+					result += "  "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"("
+					result += (",\n"+indent).join([a for a in arguments])
+					result += extra_argument+endofline+");\n"
+
+				# There is no CBLAS available, forward the call to one of the available functions
+				else: # Half-precision
+					indent = " "*(9 + routine.Length())
+
+					# Convert to float (note: also integer buffers are stored as half/float)
+					for buf in routine.inputs + routine.outputs:
+						result += "  auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer);\n"
+
+					# Call the float routine
+					result += "  cblasX"+routine.name+"("
+					result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()])
+					result += ");\n"
+
+					# Convert back to half
+					for buf in routine.outputs:
+						result += "  FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis);\n"
+
+				# Complete
+				result += "}\n"
 	return result
 
 # ==================================================================================================
@@ -303,14 +379,14 @@ if len(sys.argv) != 2:
 path_clblast = sys.argv[1]
 files = [
   path_clblast+"/include/clblast.h",
-  path_clblast+"/src/clblast.cc",
+  path_clblast+"/src/clblast.cpp",
   path_clblast+"/include/clblast_c.h",
-  path_clblast+"/src/clblast_c.cc",
-  path_clblast+"/test/wrapper_clblas.h",
-  path_clblast+"/test/wrapper_cblas.h",
+  path_clblast+"/src/clblast_c.cpp",
+  path_clblast+"/test/wrapper_clblas.hpp",
+  path_clblast+"/test/wrapper_cblas.hpp",
 ]
-header_lines = [84, 71, 93, 22, 29, 41]
-footer_lines = [17, 71, 19, 14, 6, 6]
+header_lines = [84, 74, 93, 22, 29, 41]
+footer_lines = [17, 75, 19, 14, 6, 6]
 
 # Checks whether the command-line arguments are valid; exists otherwise
 for f in files:
@@ -332,7 +408,8 @@ for i in xrange(0,len(files)):
 	# Re-writes the body of the file
 	with open(files[i], "w") as f:
 		body = ""
-		for level in [1,2,3]:
+		levels = [1,2,3] if (i == 4 or i == 5) else [1,2,3,4]
+		for level in levels:
 			body += separators[level-1]+"\n"
 			if i == 0:
 				body += clblast_h(routines[level-1])
@@ -353,39 +430,40 @@ for i in xrange(0,len(files)):
 # ==================================================================================================
 
 # Outputs all the correctness-test implementations
-for level in [1,2,3]:
+for level in [1,2,3,4]:
 	for routine in routines[level-1]:
 		if routine.has_tests:
-			filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc"
+			filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
 			with open(filename, "w") as f:
 				body = ""
-				body += "#include \"correctness/testblas.h\"\n"
-				body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
+				body += "#include \"test/correctness/testblas.hpp\"\n"
+				body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
 				body += "// Shortcuts to the clblast namespace\n"
 				body += "using float2 = clblast::float2;\n"
 				body += "using double2 = clblast::double2;\n\n"
 				body += "// Main function (not within the clblast namespace)\n"
 				body += "int main(int argc, char *argv[]) {\n"
+				body += "  auto errors = size_t{0};\n"
 				not_first = "false"
 				for flavour in routine.flavours:
-					body += "  clblast::RunTests<clblast::TestX"+routine.name+flavour.TestTemplate()
+					body += "  errors += clblast::RunTests<clblast::TestX"+routine.name+flavour.TestTemplate()
 					body += ">(argc, argv, "+not_first+", \""+flavour.name+routine.name.upper()+"\");\n"
 					not_first = "true"
-				body += "  return 0;\n"
+				body += "  if (errors > 0) { return 1; } else { return 0; }\n"
 				body += "}\n"
 				f.write(header+"\n")
 				f.write(body)
 				f.write(footer)
 
 # Outputs all the performance-test implementations
-for level in [1,2,3]:
+for level in [1,2,3,4]:
 	for routine in routines[level-1]:
 		if routine.has_tests:
-			filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc"
+			filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp"
 			with open(filename, "w") as f:
 				body = ""
-				body += "#include \"performance/client.h\"\n"
-				body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
+				body += "#include \"test/performance/client.hpp\"\n"
+				body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n"
 				body += "// Shortcuts to the clblast namespace\n"
 				body += "using float2 = clblast::float2;\n"
 				body += "using double2 = clblast::double2;\n\n"
@@ -422,7 +500,7 @@ with open(filename, "w") as f:
 	f.write("\n\n")
 
 	# Loops over the routines
-	for level in [1,2,3]:
+	for level in [1,2,3,4]:
 		for routine in routines[level-1]:
 			if routine.implemented:
 
@@ -463,7 +541,6 @@ with open(filename, "w") as f:
 						f.write("* "+requirement+"\n")
 					f.write("\n")
 
-
 				# Routine footer
 				f.write("\n\n")
 
diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py
index e5059c61..00883776 100644
--- a/scripts/generator/routine.py
+++ b/scripts/generator/routine.py
@@ -58,9 +58,9 @@ def OptionToDoc(x):
 	    'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
 	    'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
 	    'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.",
-	    'side': "The horizontal position of the triangular matrix, either `Side::kLeft` (141) or `Side::kRight` (142).",
-	    'triangle': "The vertical position of the triangular matrix, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).",
-	    'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for a non-unit values on the diagonal or `Diagonal::kUnit` (132) for a unit values on the diagonal.",
+	    'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).",
+	    'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).",
+	    'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.",
 	}[x]
 
 # ==================================================================================================
@@ -99,6 +99,18 @@ class Routine():
 	def IndexBuffers(self):
 		return ["imax","imin"]
 
+	# Lists of input/output buffers not index (integer)
+	def NonIndexInputs(self):
+		buffers = self.inputs[:] # make a copy
+		for i in self.IndexBuffers():
+			if i in buffers: buffers.remove(i)
+		return buffers
+	def NonIndexOutputs(self):
+		buffers = self.outputs[:] # make a copy
+		for i in self.IndexBuffers():
+			if i in buffers: buffers.remove(i)
+		return buffers
+
 	# List of buffers without 'inc' or 'ld'
 	def BuffersWithoutLdInc(self):
 		return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"]
@@ -119,6 +131,12 @@ class Routine():
 	def ShortNames(self):
 		return "/".join([f.name+self.name.upper() for f in self.flavours])
 
+	# As above, but excludes some
+	def ShortNamesTested(self):
+		names = [f.name+self.name.upper() for f in self.flavours]
+		if "H"+self.name.upper() in names: names.remove("H"+self.name.upper())
+		return "/".join(names)
+
 	# Determines which buffers go first (between alpha and beta) and which ones go after
 	def BuffersFirst(self):
 		if self.level == "2b":
@@ -146,6 +164,17 @@ class Routine():
 			return [", ".join(a+b+c)]
 		return []
 
+	# As above but with a '_bis' suffix for the buffer name
+	def BufferBis(self, name):
+		#if (name in self.IndexBuffers()):
+	#		return self.Buffer(name)
+		if (name in self.inputs) or (name in self.outputs):
+			a = [name+"_buffer_bis"]
+			b = [name+"_offset"]
+			c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
+			return [", ".join(a+b+c)]
+		return []
+
 	# As above but with data-types
 	def BufferDef(self, name):
 		prefix = "const " if (name in self.inputs) else ""
@@ -156,6 +185,16 @@ class Routine():
 			return [", ".join(a+b+c)]
 		return []
 
+	# As above but with data-types
+	def BufferDefWrapperCL(self, name, flavour):
+		prefix = "const " if (name in self.inputs) else ""
+		if (name in self.inputs) or (name in self.outputs):
+			a = [prefix+"Buffer<"+flavour.buffertype+">& "+name+"_buffer"]
+			b = ["const size_t "+name+"_offset"]
+			c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else []
+			return [", ".join(a+b+c)]
+		return []
+
 	# As above but as vectors
 	def BufferDefVector(self, name, flavour):
 		prefix = "const " if (name in self.inputs) else ""
@@ -179,7 +218,7 @@ class Routine():
 	# As above but with a static cast for clBLAS wrapper
 	def BufferWrapperCL(self, name):
 		if (name in self.inputs) or (name in self.outputs):
-			a = [name+"_buffer"]
+			a = [name+"_buffer()"]
 			b = [name+"_offset"]
 			c = []
 			if (name in ["x","y"]):
@@ -226,7 +265,7 @@ class Routine():
 			incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment "
 			a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."]
 			b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."]
-			c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+"."] if (name not in self.BuffersWithoutLdInc()) else []
+			c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+". This value must be greater than 0."] if (name not in self.BuffersWithoutLdInc()) else []
 			return a+b+c
 		return []
 
@@ -238,6 +277,12 @@ class Routine():
 			return [name]
 		return []
 
+	# As above, but converts from float to half
+	def ScalarHalfToFloat(self, name):
+		if name in self.scalars:
+			return ["HalfToFloat("+name+")"]
+		return []
+
 	# Retrieves the use of a scalar (alpha/beta)
 	def ScalarUse(self, name, flavour):
 		if name in self.scalars:
@@ -248,7 +293,7 @@ class Routine():
 			return [name]
 		return []
 
-	# Retrieves the use of a scalar (alpha/beta)
+	# As above, but for the clBLAS wrapper
 	def ScalarUseWrapper(self, name, flavour):
 		if name in self.scalars:
 			if name == "alpha":
@@ -258,7 +303,7 @@ class Routine():
 			return [name]
 		return []
 
-	# Retrieves the use of a scalar for CBLAS (alpha/beta)
+	# As above, but for the CBLAS wrapper
 	def ScalarUseWrapperC(self, name, flavour):
 		if name in self.scalars:
 			if flavour.IsComplex(name):
@@ -321,7 +366,7 @@ class Routine():
 	# Retrieves the documentation of the sizes
 	def SizesDoc(self):
 		if self.sizes:
-			definitions = ["`const size_t "+s+"`: Integer size argument." for s in self.sizes]
+			definitions = ["`const size_t "+s+"`: Integer size argument. This value must be positive." for s in self.sizes]
 			return definitions
 		return []
 
@@ -371,12 +416,34 @@ class Routine():
 	# Retrieves the documentation of the options
 	def OptionsDoc(self):
 		if self.options:
-			definitions = ["`const "+OptionToCLBlast(o)+"`: "+OptionToDoc(o) for o in self.options]
+			definitions = ["`const "+OptionToCLBlast(o)+" "+o+"`: "+OptionToDoc(o) for o in self.options]
 			return definitions
 		return []
 
 	# ==============================================================================================
 
+	# Retrieves a combination of all the argument names (no types)
+	def Arguments(self):
+		return (self.Options() + self.Sizes() +
+		        list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
+		        self.Scalar("alpha") +
+		        list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) +
+		        self.Scalar("beta") +
+		        list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) +
+		        list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) +
+		        list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
+
+	# As above, but with conversions from half to float
+	def ArgumentsHalf(self):
+		return (self.Options() + self.Sizes() +
+		        list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersFirst()])) +
+		        self.ScalarHalfToFloat("alpha") +
+		        list(chain(*[self.BufferBis(b) for b in self.BuffersFirst()])) +
+		        self.ScalarHalfToFloat("beta") +
+		        list(chain(*[self.BufferBis(b) for b in self.BuffersSecond()])) +
+		        list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersSecond()])) +
+		        list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
+
 	# Retrieves a combination of all the argument names, with Claduc casts
 	def ArgumentsCladuc(self, flavour, indent):
 		return (self.Options() + self.Sizes() +
@@ -388,7 +455,7 @@ class Routine():
 		        list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) +
 		        list(chain(*[self.Scalar(s) for s in self.OtherScalars()])))
 
-	# Retrieves a combination of all the argument names, with CLBlast casts
+	# As above, but with CLBlast casts
 	def ArgumentsCast(self, flavour, indent):
 		return (self.OptionsCast(indent) + self.Sizes() +
 		        list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) +
@@ -434,12 +501,12 @@ class Routine():
 	# As above, but clBLAS wrapper plain datatypes
 	def ArgumentsDefWrapperCL(self, flavour):
 		return (self.OptionsDefWrapperCL() + self.SizesDef() +
-		        list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) +
+		        list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersFirst()])) +
 		        self.ScalarDefPlain("alpha", flavour) +
-		        list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) +
+		        list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersFirst()])) +
 		        self.ScalarDefPlain("beta", flavour) +
-		        list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) +
-		        list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) +
+		        list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersSecond()])) +
+		        list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersSecond()])) +
 		        list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()])))
 
 	# As above, but CBLAS wrapper plain datatypes
@@ -480,7 +547,7 @@ class Routine():
 
 	# Retrieves a list of routine requirements for documentation
 	def RequirementsDoc(self):
-		return []
+		return self.requirements
 
 	# ==============================================================================================
 
diff --git a/test/performance/graphs/common.r b/scripts/graphs/common.r
index cd68cf26..cd68cf26 100644
--- a/test/performance/graphs/common.r
+++ b/scripts/graphs/common.r
diff --git a/test/performance/graphs/xaxpy.r b/scripts/graphs/xaxpy.r
index 187590aa..187590aa 100644
--- a/test/performance/graphs/xaxpy.r
+++ b/scripts/graphs/xaxpy.r
diff --git a/test/performance/graphs/xgemm.r b/scripts/graphs/xgemm.r
index e758f460..e758f460 100755
--- a/test/performance/graphs/xgemm.r
+++ b/scripts/graphs/xgemm.r
diff --git a/test/performance/graphs/xgemv.r b/scripts/graphs/xgemv.r
index 9a8040f7..9a8040f7 100644
--- a/test/performance/graphs/xgemv.r
+++ b/scripts/graphs/xgemv.r
diff --git a/test/performance/graphs/xsymm.r b/scripts/graphs/xsymm.r
index a65bb16f..a65bb16f 100644
--- a/test/performance/graphs/xsymm.r
+++ b/scripts/graphs/xsymm.r
diff --git a/test/performance/graphs/xsyr2k.r b/scripts/graphs/xsyr2k.r
index 4b2dd4a0..4b2dd4a0 100644
--- a/test/performance/graphs/xsyr2k.r
+++ b/scripts/graphs/xsyr2k.r
diff --git a/test/performance/graphs/xsyrk.r b/scripts/graphs/xsyrk.r
index 4ab46c9f..4ab46c9f 100644
--- a/test/performance/graphs/xsyrk.r
+++ b/scripts/graphs/xsyrk.r
diff --git a/test/performance/graphs/xtrmm.r b/scripts/graphs/xtrmm.r
index c2faaa8b..c2faaa8b 100644
--- a/test/performance/graphs/xtrmm.r
+++ b/scripts/graphs/xtrmm.r
diff --git a/src/buffer_test.hpp b/src/buffer_test.hpp
new file mode 100644
index 00000000..80f5243f
--- /dev/null
+++ b/src/buffer_test.hpp
@@ -0,0 +1,121 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the tests for the OpenCL buffers (matrices and vectors). These tests are
+// templated and thus header-only.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_BUFFER_TEST_H_
+#define CLBLAST_BUFFER_TEST_H_
+
+#include "clblast.h"
+
+namespace clblast {
+// =================================================================================================
+
+// Checks matrix 'A': its leading dimension must cover 'one' and the buffer must be large enough
+template <typename T>
+StatusCode TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (one > ld) { return StatusCode::kInvalidLeadDimA; }
+  try {
+    const auto min_bytes = (ld * (two - 1) + one + offset) * sizeof(T);
+    if (min_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryA; }
+  } catch (...) { return StatusCode::kInvalidMatrixA; } // the buffer size query threw
+  return StatusCode::kSuccess;
+}
+
+// Checks matrix 'B': its leading dimension must cover 'one' and the buffer must be large enough
+template <typename T>
+StatusCode TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (one > ld) { return StatusCode::kInvalidLeadDimB; }
+  try {
+    const auto min_bytes = (ld * (two - 1) + one + offset) * sizeof(T);
+    if (min_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryB; }
+  } catch (...) { return StatusCode::kInvalidMatrixB; } // the buffer size query threw
+  return StatusCode::kSuccess;
+}
+
+// Checks matrix 'C': its leading dimension must cover 'one' and the buffer must be large enough
+template <typename T>
+StatusCode TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
+                       const size_t offset, const size_t ld) {
+  if (one > ld) { return StatusCode::kInvalidLeadDimC; }
+  try {
+    const auto min_bytes = (ld * (two - 1) + one + offset) * sizeof(T);
+    if (min_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryC; }
+  } catch (...) { return StatusCode::kInvalidMatrixC; } // the buffer size query threw
+  return StatusCode::kSuccess;
+}
+
+// Checks packed matrix 'AP': the buffer must hold n*(n+1)/2 elements on top of the offset
+template <typename T>
+StatusCode TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+  try {
+    const auto min_bytes = (((n * (n + 1)) / 2) + offset) * sizeof(T);
+    if (min_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryA; }
+  } catch (...) { return StatusCode::kInvalidMatrixA; } // the buffer size query threw
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Checks vector 'X': the increment must be non-zero and the buffer must be large enough
+template <typename T>
+StatusCode TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
+                       const size_t inc) {
+  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
+  try {
+    const auto min_bytes = ((n - 1) * inc + 1 + offset) * sizeof(T);
+    if (min_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryX; }
+  } catch (...) { return StatusCode::kInvalidVectorX; } // the buffer size query threw
+  return StatusCode::kSuccess;
+}
+
+// Checks vector 'Y': the increment must be non-zero and the buffer must be large enough
+template <typename T>
+StatusCode TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
+                       const size_t inc) {
+  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
+  try {
+    const auto min_bytes = ((n - 1) * inc + 1 + offset) * sizeof(T);
+    if (min_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryY; }
+  } catch (...) { return StatusCode::kInvalidVectorY; } // the buffer size query threw
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Checks vector 'scalar': the buffer must hold 'n' elements on top of the offset
+template <typename T>
+StatusCode TestVectorScalar(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+  try {
+    const auto min_bytes = (n + offset) * sizeof(T);
+    if (min_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryScalar; }
+  } catch (...) { return StatusCode::kInvalidVectorScalar; } // the buffer size query threw
+  return StatusCode::kSuccess;
+}
+
+// Checks vector 'index': same size test as for 'scalar', reporting the 'scalar' status codes
+template <typename T>
+StatusCode TestVectorIndex(const size_t n, const Buffer<T> &buffer, const size_t offset) {
+  try {
+    const auto min_bytes = (n + offset) * sizeof(T);
+    if (min_bytes > buffer.GetSize()) { return StatusCode::kInsufficientMemoryScalar; }
+  } catch (...) { return StatusCode::kInvalidVectorScalar; } // the buffer size query threw
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_BUFFER_TEST_H_
+#endif
diff --git a/src/cache.cc b/src/cache.cpp
index 4dbdb711..cd9055d0 100644
--- a/src/cache.cc
+++ b/src/cache.cpp
@@ -15,10 +15,9 @@
 #include <vector>
 #include <mutex>
 
-#include "internal/cache.h"
+#include "cache.hpp"
 
 namespace clblast {
-namespace cache {
 // =================================================================================================
 
 // Stores the compiled binary or IR in the cache
@@ -98,7 +97,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
 // =================================================================================================
 
 // Clears the cache of stored binaries and programs
-StatusCode ClearCache() {
+StatusCode CacheClearAll() {
   binary_cache_mutex_.lock();
   binary_cache_.clear();
   binary_cache_mutex_.unlock();
@@ -109,5 +108,4 @@ StatusCode ClearCache() {
 }
 
 // =================================================================================================
-} // namespace cache
 } // namespace clblast
diff --git a/include/internal/cache.h b/src/cache.hpp
index 4a11b70f..0d74d7bc 100644
--- a/include/internal/cache.h
+++ b/src/cache.hpp
@@ -18,10 +18,9 @@
 #include <vector>
 #include <mutex>
 
-#include "internal/utilities.h"
+#include "utilities.hpp"
 
 namespace clblast {
-namespace cache {
 // =================================================================================================
 
 // The cache of compiled OpenCL binaries, along with some meta-data
@@ -90,10 +89,9 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
 // =================================================================================================
 
 // Clears the cache of stored binaries
-StatusCode ClearCache();
+StatusCode CacheClearAll();
 
 // =================================================================================================
-} // namespace cache
 } // namespace clblast
 
 // CLBLAST_CACHE_H_
diff --git a/src/clblast.cc b/src/clblast.cpp
index 8a9465c3..88d60772 100644
--- a/src/clblast.cc
+++ b/src/clblast.cpp
@@ -16,57 +16,60 @@
 #include <string>
 
 #include "clblast.h"
-#include "internal/public_api.h"
-#include "internal/cache.h"
+#include "public_api.hpp"
+#include "cache.hpp"
 
 // BLAS level-1 includes
-#include "internal/routines/level1/xswap.h"
-#include "internal/routines/level1/xscal.h"
-#include "internal/routines/level1/xcopy.h"
-#include "internal/routines/level1/xaxpy.h"
-#include "internal/routines/level1/xdot.h"
-#include "internal/routines/level1/xdotu.h"
-#include "internal/routines/level1/xdotc.h"
-#include "internal/routines/level1/xnrm2.h"
-#include "internal/routines/level1/xasum.h"
-#include "internal/routines/level1/xsum.h" // non-BLAS function
-#include "internal/routines/level1/xamax.h"
-#include "internal/routines/level1/xmax.h" // non-BLAS function
-#include "internal/routines/level1/xmin.h" // non-BLAS function
+#include "routines/level1/xswap.hpp"
+#include "routines/level1/xscal.hpp"
+#include "routines/level1/xcopy.hpp"
+#include "routines/level1/xaxpy.hpp"
+#include "routines/level1/xdot.hpp"
+#include "routines/level1/xdotu.hpp"
+#include "routines/level1/xdotc.hpp"
+#include "routines/level1/xnrm2.hpp"
+#include "routines/level1/xasum.hpp"
+#include "routines/level1/xsum.hpp" // non-BLAS routine
+#include "routines/level1/xamax.hpp"
+#include "routines/level1/xmax.hpp" // non-BLAS routine
+#include "routines/level1/xmin.hpp" // non-BLAS routine
 
 // BLAS level-2 includes
-#include "internal/routines/level2/xgemv.h"
-#include "internal/routines/level2/xgbmv.h"
-#include "internal/routines/level2/xhemv.h"
-#include "internal/routines/level2/xhbmv.h"
-#include "internal/routines/level2/xhpmv.h"
-#include "internal/routines/level2/xsymv.h"
-#include "internal/routines/level2/xsbmv.h"
-#include "internal/routines/level2/xspmv.h"
-#include "internal/routines/level2/xtrmv.h"
-#include "internal/routines/level2/xtbmv.h"
-#include "internal/routines/level2/xtpmv.h"
-#include "internal/routines/level2/xger.h"
-#include "internal/routines/level2/xgeru.h"
-#include "internal/routines/level2/xgerc.h"
-#include "internal/routines/level2/xher.h"
-#include "internal/routines/level2/xhpr.h"
-#include "internal/routines/level2/xher2.h"
-#include "internal/routines/level2/xhpr2.h"
-#include "internal/routines/level2/xsyr.h"
-#include "internal/routines/level2/xspr.h"
-#include "internal/routines/level2/xsyr2.h"
-#include "internal/routines/level2/xspr2.h"
+#include "routines/level2/xgemv.hpp"
+#include "routines/level2/xgbmv.hpp"
+#include "routines/level2/xhemv.hpp"
+#include "routines/level2/xhbmv.hpp"
+#include "routines/level2/xhpmv.hpp"
+#include "routines/level2/xsymv.hpp"
+#include "routines/level2/xsbmv.hpp"
+#include "routines/level2/xspmv.hpp"
+#include "routines/level2/xtrmv.hpp"
+#include "routines/level2/xtbmv.hpp"
+#include "routines/level2/xtpmv.hpp"
+#include "routines/level2/xger.hpp"
+#include "routines/level2/xgeru.hpp"
+#include "routines/level2/xgerc.hpp"
+#include "routines/level2/xher.hpp"
+#include "routines/level2/xhpr.hpp"
+#include "routines/level2/xher2.hpp"
+#include "routines/level2/xhpr2.hpp"
+#include "routines/level2/xsyr.hpp"
+#include "routines/level2/xspr.hpp"
+#include "routines/level2/xsyr2.hpp"
+#include "routines/level2/xspr2.hpp"
 
 // BLAS level-3 includes
-#include "internal/routines/level3/xgemm.h"
-#include "internal/routines/level3/xsymm.h"
-#include "internal/routines/level3/xhemm.h"
-#include "internal/routines/level3/xsyrk.h"
-#include "internal/routines/level3/xherk.h"
-#include "internal/routines/level3/xsyr2k.h"
-#include "internal/routines/level3/xher2k.h"
-#include "internal/routines/level3/xtrmm.h"
+#include "routines/level3/xgemm.hpp"
+#include "routines/level3/xsymm.hpp"
+#include "routines/level3/xhemm.hpp"
+#include "routines/level3/xsyrk.hpp"
+#include "routines/level3/xherk.hpp"
+#include "routines/level3/xsyr2k.hpp"
+#include "routines/level3/xher2k.hpp"
+#include "routines/level3/xtrmm.hpp"
+
+// Level-x includes (non-BLAS)
+#include "routines/levelx/xomatcopy.hpp"
 
 namespace clblast {
 
@@ -160,7 +163,7 @@ template StatusCode PUBLIC_API Rotm<double>(const size_t,
                                             cl_mem, const size_t,
                                             cl_command_queue*, cl_event*);
 
-// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP
+// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
 template <typename T>
 StatusCode Swap(const size_t n,
                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -190,8 +193,12 @@ template StatusCode PUBLIC_API Swap<double2>(const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Swap<half>(const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL
+// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
 template <typename T>
 StatusCode Scal(const size_t n,
                 const T alpha,
@@ -221,8 +228,12 @@ template StatusCode PUBLIC_API Scal<double2>(const size_t,
                                              const double2,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Scal<half>(const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY
+// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
 template <typename T>
 StatusCode Copy(const size_t n,
                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
@@ -252,8 +263,12 @@ template StatusCode PUBLIC_API Copy<double2>(const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Copy<half>(const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY
+// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
 template <typename T>
 StatusCode Axpy(const size_t n,
                 const T alpha,
@@ -289,8 +304,13 @@ template StatusCode PUBLIC_API Axpy<double2>(const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Axpy<half>(const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Dot product of two vectors: SDOT/DDOT
+// Dot product of two vectors: SDOT/DDOT/HDOT
 template <typename T>
 StatusCode Dot(const size_t n,
                cl_mem dot_buffer, const size_t dot_offset,
@@ -316,6 +336,11 @@ template StatusCode PUBLIC_API Dot<double>(const size_t,
                                            const cl_mem, const size_t, const size_t,
                                            const cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Dot<half>(const size_t,
+                                         cl_mem, const size_t,
+                                         const cl_mem, const size_t, const size_t,
+                                         const cl_mem, const size_t, const size_t,
+                                         cl_command_queue*, cl_event*);
 
 // Dot product of two complex vectors: CDOTU/ZDOTU
 template <typename T>
@@ -371,7 +396,7 @@ template StatusCode PUBLIC_API Dotc<double2>(const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
 
-// Euclidian norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2
+// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
 template <typename T>
 StatusCode Nrm2(const size_t n,
                 cl_mem nrm2_buffer, const size_t nrm2_offset,
@@ -401,8 +426,12 @@ template StatusCode PUBLIC_API Nrm2<double2>(const size_t,
                                              cl_mem, const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Nrm2<half>(const size_t,
+                                          cl_mem, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM
+// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
 template <typename T>
 StatusCode Asum(const size_t n,
                 cl_mem asum_buffer, const size_t asum_offset,
@@ -432,8 +461,12 @@ template StatusCode PUBLIC_API Asum<double2>(const size_t,
                                              cl_mem, const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Asum<half>(const size_t,
+                                          cl_mem, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM
+// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
 template <typename T>
 StatusCode Sum(const size_t n,
                cl_mem sum_buffer, const size_t sum_offset,
@@ -463,8 +496,12 @@ template StatusCode PUBLIC_API Sum<double2>(const size_t,
                                             cl_mem, const size_t,
                                             const cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Sum<half>(const size_t,
+                                         cl_mem, const size_t,
+                                         const cl_mem, const size_t, const size_t,
+                                         cl_command_queue*, cl_event*);
 
-// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX
+// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
 template <typename T>
 StatusCode Amax(const size_t n,
                 cl_mem imax_buffer, const size_t imax_offset,
@@ -494,8 +531,12 @@ template StatusCode PUBLIC_API Amax<double2>(const size_t,
                                              cl_mem, const size_t,
                                              const cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Amax<half>(const size_t,
+                                          cl_mem, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX
+// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
 template <typename T>
 StatusCode Max(const size_t n,
                cl_mem imax_buffer, const size_t imax_offset,
@@ -525,8 +566,12 @@ template StatusCode PUBLIC_API Max<double2>(const size_t,
                                             cl_mem, const size_t,
                                             const cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Max<half>(const size_t,
+                                         cl_mem, const size_t,
+                                         const cl_mem, const size_t, const size_t,
+                                         cl_command_queue*, cl_event*);
 
-// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN
+// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
 template <typename T>
 StatusCode Min(const size_t n,
                cl_mem imin_buffer, const size_t imin_offset,
@@ -556,12 +601,16 @@ template StatusCode PUBLIC_API Min<double2>(const size_t,
                                             cl_mem, const size_t,
                                             const cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Min<half>(const size_t,
+                                         cl_mem, const size_t,
+                                         const cl_mem, const size_t, const size_t,
+                                         cl_command_queue*, cl_event*);
 
 // =================================================================================================
 // BLAS level-2 (matrix-vector) routines
 // =================================================================================================
 
-// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV
+// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
 template <typename T>
 StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                 const size_t m, const size_t n,
@@ -615,8 +664,16 @@ template StatusCode PUBLIC_API Gemv<double2>(const Layout, const Transpose,
                                              const double2,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Gemv<half>(const Layout, const Transpose,
+                                          const size_t, const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV
+// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
 template <typename T>
 StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
                 const size_t m, const size_t n, const size_t kl, const size_t ku,
@@ -670,6 +727,14 @@ template StatusCode PUBLIC_API Gbmv<double2>(const Layout, const Transpose,
                                              const double2,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Gbmv<half>(const Layout, const Transpose,
+                                          const size_t, const size_t, const size_t, const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
 // Hermitian matrix-vector multiplication: CHEMV/ZHEMV
 template <typename T>
@@ -788,7 +853,7 @@ template StatusCode PUBLIC_API Hpmv<double2>(const Layout, const Triangle,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
 
-// Symmetric matrix-vector multiplication: SSYMV/DSYMV
+// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
 template <typename T>
 StatusCode Symv(const Layout layout, const Triangle triangle,
                 const size_t n,
@@ -826,8 +891,16 @@ template StatusCode PUBLIC_API Symv<double>(const Layout, const Triangle,
                                             const double,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Symv<half>(const Layout, const Triangle,
+                                          const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV
+// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
 template <typename T>
 StatusCode Sbmv(const Layout layout, const Triangle triangle,
                 const size_t n, const size_t k,
@@ -865,8 +938,16 @@ template StatusCode PUBLIC_API Sbmv<double>(const Layout, const Triangle,
                                             const double,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Sbmv<half>(const Layout, const Triangle,
+                                          const size_t, const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV
+// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
 template <typename T>
 StatusCode Spmv(const Layout layout, const Triangle triangle,
                 const size_t n,
@@ -904,8 +985,16 @@ template StatusCode PUBLIC_API Spmv<double>(const Layout, const Triangle,
                                             const double,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Spmv<half>(const Layout, const Triangle,
+                                          const size_t,
+                                          const half,
+                                          const cl_mem, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV
+// Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV
 template <typename T>
 StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t n,
@@ -941,8 +1030,13 @@ template StatusCode PUBLIC_API Trmv<double2>(const Layout, const Triangle, const
                                              const cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Trmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
+                                          const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV
+// Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV
 template <typename T>
 StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t n, const size_t k,
@@ -978,8 +1072,13 @@ template StatusCode PUBLIC_API Tbmv<double2>(const Layout, const Triangle, const
                                              const cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Tbmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
+                                          const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV
+// Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV
 template <typename T>
 StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t n,
@@ -1015,6 +1114,11 @@ template StatusCode PUBLIC_API Tpmv<double2>(const Layout, const Triangle, const
                                              const cl_mem, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Tpmv<half>(const Layout, const Triangle, const Transpose, const Diagonal,
+                                          const size_t,
+                                          const cl_mem, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
 // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV
 template <typename T>
@@ -1106,7 +1210,7 @@ template StatusCode PUBLIC_API Tpsv<double2>(const Layout, const Triangle, const
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
 
-// General rank-1 matrix update: SGER/DGER
+// General rank-1 matrix update: SGER/DGER/HGER
 template <typename T>
 StatusCode Ger(const Layout layout,
                const size_t m, const size_t n,
@@ -1140,6 +1244,13 @@ template StatusCode PUBLIC_API Ger<double>(const Layout,
                                            const cl_mem, const size_t, const size_t,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Ger<half>(const Layout,
+                                         const size_t, const size_t,
+                                         const half,
+                                         const cl_mem, const size_t, const size_t,
+                                         const cl_mem, const size_t, const size_t,
+                                         cl_mem, const size_t, const size_t,
+                                         cl_command_queue*, cl_event*);
 
 // General rank-1 complex matrix update: CGERU/ZGERU
 template <typename T>
@@ -1343,7 +1454,7 @@ template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle,
                                              cl_mem, const size_t,
                                              cl_command_queue*, cl_event*);
 
-// Symmetric rank-1 matrix update: SSYR/DSYR
+// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
 template <typename T>
 StatusCode Syr(const Layout layout, const Triangle triangle,
                const size_t n,
@@ -1373,8 +1484,14 @@ template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle,
                                            const cl_mem, const size_t, const size_t,
                                            cl_mem, const size_t, const size_t,
                                            cl_command_queue*, cl_event*);
-
-// Symmetric packed rank-1 matrix update: SSPR/DSPR
+template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle,
+                                         const size_t,
+                                         const half,
+                                         const cl_mem, const size_t, const size_t,
+                                         cl_mem, const size_t, const size_t,
+                                         cl_command_queue*, cl_event*);
+
+// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
 template <typename T>
 StatusCode Spr(const Layout layout, const Triangle triangle,
                const size_t n,
@@ -1404,8 +1521,14 @@ template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle,
                                            const cl_mem, const size_t, const size_t,
                                            cl_mem, const size_t,
                                            cl_command_queue*, cl_event*);
-
-// Symmetric rank-2 matrix update: SSYR2/DSYR2
+template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle,
+                                         const size_t,
+                                         const half,
+                                         const cl_mem, const size_t, const size_t,
+                                         cl_mem, const size_t,
+                                         cl_command_queue*, cl_event*);
+
+// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
 template <typename T>
 StatusCode Syr2(const Layout layout, const Triangle triangle,
                 const size_t n,
@@ -1439,8 +1562,15 @@ template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle,
                                             const cl_mem, const size_t, const size_t,
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle,
+                                          const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
 template <typename T>
 StatusCode Spr2(const Layout layout, const Triangle triangle,
                 const size_t n,
@@ -1474,12 +1604,19 @@ template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle,
                                             const cl_mem, const size_t, const size_t,
                                             cl_mem, const size_t,
                                             cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle,
+                                          const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t,
+                                          cl_command_queue*, cl_event*);
 
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
 // =================================================================================================
 
-// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM
+// General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM
 template <typename T>
 StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
                 const size_t m, const size_t n, const size_t k,
@@ -1533,8 +1670,16 @@ template StatusCode PUBLIC_API Gemm<double2>(const Layout, const Transpose, cons
                                              const double2,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Gemm<half>(const Layout, const Transpose, const Transpose,
+                                          const size_t, const size_t, const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM
+// Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM
 template <typename T>
 StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
                 const size_t m, const size_t n,
@@ -1588,6 +1733,14 @@ template StatusCode PUBLIC_API Symm<double2>(const Layout, const Side, const Tri
                                              const double2,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Symm<half>(const Layout, const Side, const Triangle,
+                                          const size_t, const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const cl_mem, const size_t, const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
 // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM
 template <typename T>
@@ -1628,7 +1781,7 @@ template StatusCode PUBLIC_API Hemm<double2>(const Layout, const Side, const Tri
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
 
-// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK
+// Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK
 template <typename T>
 StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
                 const size_t n, const size_t k,
@@ -1676,6 +1829,13 @@ template StatusCode PUBLIC_API Syrk<double2>(const Layout, const Triangle, const
                                              const double2,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Syrk<half>(const Layout, const Triangle, const Transpose,
+                                          const size_t, const size_t,
+                                          const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          const half,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
 // Rank-K update of a hermitian matrix: CHERK/ZHERK
 template <typename T>
@@ -1712,7 +1872,7 @@ template StatusCode PUBLIC_API Herk<double>(const Layout, const Triangle, const
                                             cl_mem, const size_t, const size_t,
                                             cl_command_queue*, cl_event*);
 
-// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K
+// Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K
 template <typename T>
 StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
                  const size_t n, const size_t k,
@@ -1766,6 +1926,14 @@ template StatusCode PUBLIC_API Syr2k<double2>(const Layout, const Triangle, cons
                                               const double2,
                                               cl_mem, const size_t, const size_t,
                                               cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Syr2k<half>(const Layout, const Triangle, const Transpose,
+                                           const size_t, const size_t,
+                                           const half,
+                                           const cl_mem, const size_t, const size_t,
+                                           const cl_mem, const size_t, const size_t,
+                                           const half,
+                                           cl_mem, const size_t, const size_t,
+                                           cl_command_queue*, cl_event*);
 
 // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K
 template <typename T, typename U>
@@ -1806,7 +1974,7 @@ template StatusCode PUBLIC_API Her2k<double2,double>(const Layout, const Triangl
                                                      cl_mem, const size_t, const size_t,
                                                      cl_command_queue*, cl_event*);
 
-// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM
+// Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM
 template <typename T>
 StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
                 const size_t m, const size_t n,
@@ -1848,8 +2016,14 @@ template StatusCode PUBLIC_API Trmm<double2>(const Layout, const Side, const Tri
                                              const cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+// Explicit instantiation of Trmm for the half type (HTRMM)
+template StatusCode PUBLIC_API Trmm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+                                          const size_t, const size_t, const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
 
-// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM
+// Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM/HTRSM
 template <typename T>
 StatusCode Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
                 const size_t, const size_t,
@@ -1883,13 +2057,73 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri
                                              const cl_mem, const size_t, const size_t,
                                              cl_mem, const size_t, const size_t,
                                              cl_command_queue*, cl_event*);
+// Explicit instantiation of Trsm for the half type (HTRSM)
+template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triangle, const Transpose, const Diagonal,
+                                          const size_t, const size_t, const half,
+                                          const cl_mem, const size_t, const size_t,
+                                          cl_mem, const size_t, const size_t,
+                                          cl_command_queue*, cl_event*);
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-of-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                    cl_command_queue* queue, cl_event* event) {
+  // Builds the Xomatcopy<T> routine object on top of the caller's OpenCL queue
+  auto wrapped_queue = Queue(*queue);
+  auto omatcopy = Xomatcopy<T>(wrapped_queue, event);
+  // Stops early and reports the set-up status if it is anything other than success
+  const auto setup_status = omatcopy.SetUp();
+  if (setup_status != StatusCode::kSuccess) { return setup_status; }
+  return omatcopy.DoOmatcopy(layout, a_transpose, m, n, alpha,
+                             Buffer<T>(a_buffer), a_offset, a_ld,
+                             Buffer<T>(b_buffer), b_offset, b_ld);
+}
+// Explicit instantiation of Omatcopy for float
+template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose,
+                                               const size_t, const size_t, const float,
+                                               const cl_mem, const size_t, const size_t,
+                                               cl_mem, const size_t, const size_t,
+                                               cl_command_queue*, cl_event*);
+// Explicit instantiation of Omatcopy for double
+template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose,
+                                                const size_t, const size_t, const double,
+                                                const cl_mem, const size_t, const size_t,
+                                                cl_mem, const size_t, const size_t,
+                                                cl_command_queue*, cl_event*);
+// Explicit instantiation of Omatcopy for float2
+template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose,
+                                                const size_t, const size_t, const float2,
+                                                const cl_mem, const size_t, const size_t,
+                                                cl_mem, const size_t, const size_t,
+                                                cl_command_queue*, cl_event*);
+// Explicit instantiation of Omatcopy for double2
+template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose,
+                                                 const size_t, const size_t, const double2,
+                                                 const cl_mem, const size_t, const size_t,
+                                                 cl_mem, const size_t, const size_t,
+                                                 cl_command_queue*, cl_event*);
+// Explicit instantiation of Omatcopy for half
+template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
+                                              const size_t, const size_t, const half,
+                                              const cl_mem, const size_t, const size_t,
+                                              cl_mem, const size_t, const size_t,
+                                              cl_command_queue*, cl_event*);
 
 // =================================================================================================
 
 // Clears the cache of stored binaries
-StatusCode ClearCache() { return cache::ClearCache(); }
+StatusCode ClearCache() { return CacheClearAll(); }  // returns CacheClearAll()'s status unchanged
 
 // Fills the cache with all binaries for a specific device
+// TODO: Add half-precision FP16 set-up calls
 StatusCode FillCache(const cl_device_id device) {
   try {
 
@@ -1938,7 +2172,7 @@ StatusCode FillCache(const cl_device_id device) {
     Xsyr2<float>(queue, nullptr).SetUp(); Xsyr2<double>(queue, nullptr).SetUp();
     Xspr2<float>(queue, nullptr).SetUp(); Xspr2<double>(queue, nullptr).SetUp();
 
-    // Runs all the level 1 set-up functions
+    // Runs all the level 3 set-up functions
     Xgemm<float>(queue, nullptr).SetUp(); Xgemm<double>(queue, nullptr).SetUp(); Xgemm<float2>(queue, nullptr).SetUp(); Xgemm<double2>(queue, nullptr).SetUp();
     Xsymm<float>(queue, nullptr).SetUp(); Xsymm<double>(queue, nullptr).SetUp(); Xsymm<float2>(queue, nullptr).SetUp(); Xsymm<double2>(queue, nullptr).SetUp();
     Xhemm<float2>(queue, nullptr).SetUp(); Xhemm<double2>(queue, nullptr).SetUp();
@@ -1948,6 +2182,9 @@ StatusCode FillCache(const cl_device_id device) {
     Xher2k<float2,float>(queue, nullptr).SetUp(); Xher2k<double2,double>(queue, nullptr).SetUp();
     Xtrmm<float>(queue, nullptr).SetUp(); Xtrmm<double>(queue, nullptr).SetUp(); Xtrmm<float2>(queue, nullptr).SetUp(); Xtrmm<double2>(queue, nullptr).SetUp();
 
+    // Runs all the level-X (extra non-BLAS) set-up functions
+    Xomatcopy<float>(queue, nullptr).SetUp(); Xomatcopy<double>(queue, nullptr).SetUp(); Xomatcopy<float2>(queue, nullptr).SetUp(); Xomatcopy<double2>(queue, nullptr).SetUp();
+
   } catch (...) { return StatusCode::kBuildProgramFailure; }
   return StatusCode::kSuccess;
 }
diff --git a/src/clblast_c.cc b/src/clblast_c.cpp
index 1fc63de2..9ea2c884 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cpp
@@ -15,7 +15,7 @@
 
 #include "clblast_c.h"
 #include "clblast.h"
-#include "internal/utilities.h"
+#include "utilities.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -178,6 +178,16 @@ StatusCode CLBlastZswap(const size_t n,
                                        queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHswap(const size_t n,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Swap<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Swap<half>(n,
+      x_buffer, x_offset, x_inc,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // SCAL
 StatusCode CLBlastSscal(const size_t n,
@@ -220,6 +230,16 @@ StatusCode CLBlastZscal(const size_t n,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHscal(const size_t n,
+                        const cl_half alpha,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Scal with the cl_half alpha and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Scal(n,
+      alpha,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // COPY
 StatusCode CLBlastScopy(const size_t n,
@@ -262,6 +282,16 @@ StatusCode CLBlastZcopy(const size_t n,
                                        queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHcopy(const size_t n,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Copy<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Copy<half>(n,
+      x_buffer, x_offset, x_inc,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // AXPY
 StatusCode CLBlastSaxpy(const size_t n,
@@ -312,6 +342,18 @@ StatusCode CLBlastZaxpy(const size_t n,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHaxpy(const size_t n,
+                        const cl_half alpha,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Axpy with the cl_half alpha and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Axpy(n,
+      alpha,
+      x_buffer, x_offset, x_inc,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // DOT
 StatusCode CLBlastSdot(const size_t n,
@@ -338,6 +380,18 @@ StatusCode CLBlastDdot(const size_t n,
                                      queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHdot(const size_t n,
+                       cl_mem dot_buffer, const size_t dot_offset,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                       cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Dot<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Dot<half>(n,
+      dot_buffer, dot_offset,
+      x_buffer, x_offset, x_inc,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // DOTU
 StatusCode CLBlastCdotu(const size_t n,
@@ -432,6 +486,16 @@ StatusCode CLBlastDznrm2(const size_t n,
                                        queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHnrm2(const size_t n,
+                        cl_mem nrm2_buffer, const size_t nrm2_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Nrm2<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Nrm2<half>(n,
+      nrm2_buffer, nrm2_offset,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // ASUM
 StatusCode CLBlastSasum(const size_t n,
@@ -474,6 +538,16 @@ StatusCode CLBlastDzasum(const size_t n,
                                        queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHasum(const size_t n,
+                        cl_mem asum_buffer, const size_t asum_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Asum<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Asum<half>(n,
+      asum_buffer, asum_offset,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // SUM
 StatusCode CLBlastSsum(const size_t n,
@@ -516,6 +590,16 @@ StatusCode CLBlastDzsum(const size_t n,
                                       queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHsum(const size_t n,
+                       cl_mem sum_buffer, const size_t sum_offset,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Sum<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Sum<half>(n,
+      sum_buffer, sum_offset,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // AMAX
 StatusCode CLBlastiSamax(const size_t n,
@@ -558,6 +642,16 @@ StatusCode CLBlastiZamax(const size_t n,
                                        queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastiHamax(const size_t n,
+                         cl_mem imax_buffer, const size_t imax_offset,
+                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Amax<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Amax<half>(n,
+      imax_buffer, imax_offset,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // MAX
 StatusCode CLBlastiSmax(const size_t n,
@@ -600,6 +694,16 @@ StatusCode CLBlastiZmax(const size_t n,
                                       queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastiHmax(const size_t n,
+                        cl_mem imax_buffer, const size_t imax_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Max<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Max<half>(n,
+      imax_buffer, imax_offset,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // MIN
 StatusCode CLBlastiSmin(const size_t n,
@@ -642,6 +746,16 @@ StatusCode CLBlastiZmin(const size_t n,
                                       queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastiHmin(const size_t n,
+                        cl_mem imin_buffer, const size_t imin_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Forwards to clblast::Min<half> and casts the returned status to StatusCode
+  return static_cast<StatusCode>(clblast::Min<half>(n,
+      imin_buffer, imin_offset,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // =================================================================================================
 // BLAS level-2 (matrix-vector) routines
@@ -724,6 +838,25 @@ StatusCode CLBlastZgemv(const Layout layout, const Transpose a_transpose,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHgemv(const Layout layout, const Transpose a_transpose,
+                        const size_t m, const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Gemv, casts the status back
+  return static_cast<StatusCode>(clblast::Gemv(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Transpose>(a_transpose),
+      m, n,
+      alpha,
+      a_buffer, a_offset, a_ld,
+      x_buffer, x_offset, x_inc,
+      beta,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // GBMV
 StatusCode CLBlastSgbmv(const Layout layout, const Transpose a_transpose,
@@ -802,6 +935,25 @@ StatusCode CLBlastZgbmv(const Layout layout, const Transpose a_transpose,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHgbmv(const Layout layout, const Transpose a_transpose,
+                        const size_t m, const size_t n, const size_t kl, const size_t ku,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Gbmv, casts the status back
+  return static_cast<StatusCode>(clblast::Gbmv(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Transpose>(a_transpose),
+      m, n, kl, ku,
+      alpha,
+      a_buffer, a_offset, a_ld,
+      x_buffer, x_offset, x_inc,
+      beta,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // HEMV
 StatusCode CLBlastChemv(const Layout layout, const Triangle triangle,
@@ -962,6 +1114,25 @@ StatusCode CLBlastDsymv(const Layout layout, const Triangle triangle,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHsymv(const Layout layout, const Triangle triangle,
+                        const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Symv, casts the status back
+  return static_cast<StatusCode>(clblast::Symv(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      a_buffer, a_offset, a_ld,
+      x_buffer, x_offset, x_inc,
+      beta,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // SBMV
 StatusCode CLBlastSsbmv(const Layout layout, const Triangle triangle,
@@ -1002,6 +1173,25 @@ StatusCode CLBlastDsbmv(const Layout layout, const Triangle triangle,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHsbmv(const Layout layout, const Triangle triangle,
+                        const size_t n, const size_t k,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Sbmv, casts the status back
+  return static_cast<StatusCode>(clblast::Sbmv(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n, k,
+      alpha,
+      a_buffer, a_offset, a_ld,
+      x_buffer, x_offset, x_inc,
+      beta,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // SPMV
 StatusCode CLBlastSspmv(const Layout layout, const Triangle triangle,
@@ -1042,6 +1232,25 @@ StatusCode CLBlastDspmv(const Layout layout, const Triangle triangle,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHspmv(const Layout layout, const Triangle triangle,
+                        const size_t n,
+                        const cl_half alpha,
+                        const cl_mem ap_buffer, const size_t ap_offset,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_half beta,
+                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Spmv, casts the status back
+  return static_cast<StatusCode>(clblast::Spmv(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      ap_buffer, ap_offset,
+      x_buffer, x_offset, x_inc,
+      beta,
+      y_buffer, y_offset, y_inc,
+      queue, event));
+}
 
 // TRMV
 StatusCode CLBlastStrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -1104,6 +1313,21 @@ StatusCode CLBlastZtrmv(const Layout layout, const Triangle triangle, const Tran
                                        queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHtrmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t n,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Trmv<half>, casts the status back
+  return static_cast<StatusCode>(clblast::Trmv<half>(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      a_buffer, a_offset, a_ld,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // TBMV
 StatusCode CLBlastStbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -1166,6 +1390,21 @@ StatusCode CLBlastZtbmv(const Layout layout, const Triangle triangle, const Tran
                                        queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHtbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t n, const size_t k,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Tbmv<half>, casts the status back
+  return static_cast<StatusCode>(clblast::Tbmv<half>(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n, k,
+      a_buffer, a_offset, a_ld,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // TPMV
 StatusCode CLBlastStpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -1228,6 +1467,21 @@ StatusCode CLBlastZtpmv(const Layout layout, const Triangle triangle, const Tran
                                        queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHtpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t n,
+                        const cl_mem ap_buffer, const size_t ap_offset,
+                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Tpmv<half>, casts the status back
+  return static_cast<StatusCode>(clblast::Tpmv<half>(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      static_cast<clblast::Transpose>(a_transpose),
+      static_cast<clblast::Diagonal>(diagonal),
+      n,
+      ap_buffer, ap_offset,
+      x_buffer, x_offset, x_inc,
+      queue, event));
+}
 
 // TRSV
 StatusCode CLBlastStrsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -1448,6 +1702,22 @@ StatusCode CLBlastDger(const Layout layout,
                              queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHger(const Layout layout,
+                       const size_t m, const size_t n,
+                       const cl_half alpha,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                       cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                       cl_command_queue* queue, cl_event* event) {
+  // Converts the layout enum to its clblast:: type, calls clblast::Ger, casts the status back
+  return static_cast<StatusCode>(clblast::Ger(static_cast<clblast::Layout>(layout),
+      m, n,
+      alpha,
+      x_buffer, x_offset, x_inc,
+      y_buffer, y_offset, y_inc,
+      a_buffer, a_offset, a_ld,
+      queue, event));
+}
 
 // GERU
 StatusCode CLBlastCgeru(const Layout layout,
@@ -1684,6 +1954,21 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
                              queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
+                       const size_t n,
+                       const cl_half alpha,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                       cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Syr, casts the status back
+  return static_cast<StatusCode>(clblast::Syr(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      x_buffer, x_offset, x_inc,
+      a_buffer, a_offset, a_ld,
+      queue, event));
+}
 
 // SPR
 StatusCode CLBlastSspr(const Layout layout, const Triangle triangle,
@@ -1716,6 +2001,21 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
                              queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
+                       const size_t n,
+                       const cl_half alpha,
+                       const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                       cl_mem ap_buffer, const size_t ap_offset,
+                       cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Spr, casts the status back
+  return static_cast<StatusCode>(clblast::Spr(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      x_buffer, x_offset, x_inc,
+      ap_buffer, ap_offset,
+      queue, event));
+}
 
 // SYR2
 StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle,
@@ -1752,6 +2052,23 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
+                        const size_t n,
+                        const cl_half alpha,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_command_queue* queue, cl_event* event) {
+  // Converts the enum arguments to clblast:: types, calls clblast::Syr2, casts the status back
+  return static_cast<StatusCode>(clblast::Syr2(static_cast<clblast::Layout>(layout),
+      static_cast<clblast::Triangle>(triangle),
+      n,
+      alpha,
+      x_buffer, x_offset, x_inc,
+      y_buffer, y_offset, y_inc,
+      a_buffer, a_offset, a_ld,
+      queue, event));
+}
 
 // SPR2
 StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle,
@@ -1788,6 +2105,23 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
+                        const size_t n,
+                        const cl_half alpha,
+                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        cl_mem ap_buffer, const size_t ap_offset,
+                        cl_command_queue* queue, cl_event* event) {
+  // cl_half SPR2: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
+  const auto result = clblast::Spr2(cpp_layout, cpp_triangle, n, alpha,
+                                    x_buffer, x_offset, x_inc,
+                                    y_buffer, y_offset, y_inc,
+                                    ap_buffer, ap_offset,
+                                    queue, event);
+  return static_cast<StatusCode>(result);
+}
 
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
@@ -1874,6 +2208,26 @@ StatusCode CLBlastZgemm(const Layout layout, const Transpose a_transpose, const
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHgemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                        const size_t m, const size_t n, const size_t k,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                        const cl_half beta,
+                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                        cl_command_queue* queue, cl_event* event) {
+  // cl_half GEMM: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto cpp_b_transpose = static_cast<clblast::Transpose>(b_transpose);
+  const auto result = clblast::Gemm(cpp_layout, cpp_a_transpose, cpp_b_transpose,
+                                    m, n, k, alpha,
+                                    a_buffer, a_offset, a_ld,
+                                    b_buffer, b_offset, b_ld,
+                                    beta, c_buffer, c_offset, c_ld,
+                                    queue, event);
+  return static_cast<StatusCode>(result);
+}
 
 // SYMM
 StatusCode CLBlastSsymm(const Layout layout, const Side side, const Triangle triangle,
@@ -1956,6 +2310,26 @@ StatusCode CLBlastZsymm(const Layout layout, const Side side, const Triangle tri
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHsymm(const Layout layout, const Side side, const Triangle triangle,
+                        const size_t m, const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                        const cl_half beta,
+                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                        cl_command_queue* queue, cl_event* event) {
+  // cl_half SYMM: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_side = static_cast<clblast::Side>(side);
+  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
+  const auto result = clblast::Symm(cpp_layout, cpp_side, cpp_triangle,
+                                    m, n, alpha,
+                                    a_buffer, a_offset, a_ld,
+                                    b_buffer, b_offset, b_ld,
+                                    beta, c_buffer, c_offset, c_ld,
+                                    queue, event);
+  return static_cast<StatusCode>(result);
+}
 
 // HEMM
 StatusCode CLBlastChemm(const Layout layout, const Side side, const Triangle triangle,
@@ -2072,6 +2446,24 @@ StatusCode CLBlastZsyrk(const Layout layout, const Triangle triangle, const Tran
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHsyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                        const size_t n, const size_t k,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const cl_half beta,
+                        cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                        cl_command_queue* queue, cl_event* event) {
+  // cl_half SYRK: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto result = clblast::Syrk(cpp_layout, cpp_triangle, cpp_a_transpose,
+                                    n, k, alpha,
+                                    a_buffer, a_offset, a_ld,
+                                    beta, c_buffer, c_offset, c_ld,
+                                    queue, event);
+  return static_cast<StatusCode>(result);
+}
 
 // HERK
 StatusCode CLBlastCherk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
@@ -2192,6 +2584,26 @@ StatusCode CLBlastZsyr2k(const Layout layout, const Triangle triangle, const Tra
                                queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHsyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                         const size_t n, const size_t k,
+                         const cl_half alpha,
+                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const cl_half beta,
+                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         cl_command_queue* queue, cl_event* event) {
+  // cl_half SYR2K: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
+  const auto cpp_ab_transpose = static_cast<clblast::Transpose>(ab_transpose);
+  const auto result = clblast::Syr2k(cpp_layout, cpp_triangle, cpp_ab_transpose,
+                                     n, k, alpha,
+                                     a_buffer, a_offset, a_ld,
+                                     b_buffer, b_offset, b_ld,
+                                     beta, c_buffer, c_offset, c_ld,
+                                     queue, event);
+  return static_cast<StatusCode>(result);
+}
 
 // HER2K
 StatusCode CLBlastCher2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
@@ -2308,6 +2720,24 @@ StatusCode CLBlastZtrmm(const Layout layout, const Side side, const Triangle tri
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHtrmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t m, const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                        cl_command_queue* queue, cl_event* event) {
+  // cl_half TRMM: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_side = static_cast<clblast::Side>(side);
+  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto cpp_diagonal = static_cast<clblast::Diagonal>(diagonal);
+  const auto result = clblast::Trmm(cpp_layout, cpp_side, cpp_triangle,
+                                    cpp_a_transpose, cpp_diagonal, m, n, alpha,
+                                    a_buffer, a_offset, a_ld,
+                                    b_buffer, b_offset, b_ld, queue, event);
+  return static_cast<StatusCode>(result);
+}
 
 // TRSM
 StatusCode CLBlastStrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
@@ -2382,6 +2812,105 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri
                               queue, event);
   return static_cast<StatusCode>(status);
 }
+StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                        const size_t m, const size_t n,
+                        const cl_half alpha,
+                        const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                        cl_command_queue* queue, cl_event* event) {
+  // cl_half TRSM: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_side = static_cast<clblast::Side>(side);
+  const auto cpp_triangle = static_cast<clblast::Triangle>(triangle);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto cpp_diagonal = static_cast<clblast::Diagonal>(diagonal);
+  const auto result = clblast::Trsm(cpp_layout, cpp_side, cpp_triangle,
+                                    cpp_a_transpose, cpp_diagonal, m, n, alpha,
+                                    a_buffer, a_offset, a_ld,
+                                    b_buffer, b_offset, b_ld, queue, event);
+  return static_cast<StatusCode>(result);
+}
+
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// OMATCOPY
+StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const float alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  // float OMATCOPY: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto result = clblast::Omatcopy(cpp_layout, cpp_a_transpose, m, n, alpha,
+                                        a_buffer, a_offset, a_ld,
+                                        b_buffer, b_offset, b_ld,
+                                        queue, event);
+  return static_cast<StatusCode>(result);
+}
+StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const double alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  // double OMATCOPY: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto result = clblast::Omatcopy(cpp_layout, cpp_a_transpose, m, n, alpha,
+                                        a_buffer, a_offset, a_ld,
+                                        b_buffer, b_offset, b_ld,
+                                        queue, event);
+  return static_cast<StatusCode>(result);
+}
+StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_float2 alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  // cl_float2 OMATCOPY: convert the enums, repack alpha's two components as a float2, forward
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto cpp_alpha = float2{alpha.s[0], alpha.s[1]};
+  const auto result = clblast::Omatcopy(cpp_layout, cpp_a_transpose, m, n, cpp_alpha,
+                                        a_buffer, a_offset, a_ld,
+                                        b_buffer, b_offset, b_ld, queue, event);
+  return static_cast<StatusCode>(result);
+}
+StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_double2 alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  // cl_double2 OMATCOPY: convert the enums, repack alpha's two components as a double2, forward
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto cpp_alpha = double2{alpha.s[0], alpha.s[1]};
+  const auto result = clblast::Omatcopy(cpp_layout, cpp_a_transpose, m, n, cpp_alpha,
+                                        a_buffer, a_offset, a_ld,
+                                        b_buffer, b_offset, b_ld, queue, event);
+  return static_cast<StatusCode>(result);
+}
+StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_half alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  // cl_half OMATCOPY: convert the C enums to their C++ equivalents, then forward all arguments
+  const auto cpp_layout = static_cast<clblast::Layout>(layout);
+  const auto cpp_a_transpose = static_cast<clblast::Transpose>(a_transpose);
+  const auto result = clblast::Omatcopy(cpp_layout, cpp_a_transpose, m, n, alpha,
+                                        a_buffer, a_offset, a_ld,
+                                        b_buffer, b_offset, b_ld,
+                                        queue, event);
+  return static_cast<StatusCode>(result);
+}
 
 // =================================================================================================
 
diff --git a/include/internal/clpp11.h b/src/clpp11.hpp
index e70f9000..b834d8b4 100644
--- a/include/internal/clpp11.h
+++ b/src/clpp11.hpp
@@ -207,6 +207,12 @@ class Device {
     return true;
   }
 
+  // Queries for a specific type of device or brand; each check reads Type()/Vendor() only once
+  bool IsCPU() const { return Type() == "CPU"; }
+  bool IsGPU() const { return Type() == "GPU"; }
+  bool IsAMD() const { const auto vendor = Vendor(); return vendor == "AMD" || vendor == "Advanced Micro Devices, Inc."; }
+  bool IsARM() const { return Vendor() == "ARM"; }
+
   // Accessor to the private data-member
   const cl_device_id& operator()() const { return device_; }
  private:
diff --git a/src/database.cc b/src/database/database.cpp
index addd85d3..6ec93731 100644
--- a/src/database.cc
+++ b/src/database/database.cpp
@@ -11,33 +11,33 @@
 //
 // =================================================================================================
 
-#include "internal/database.h"
-#include "internal/database/xaxpy.h"
-#include "internal/database/xdot.h"
-#include "internal/database/xgemv.h"
-#include "internal/database/xger.h"
-#include "internal/database/xgemm.h"
-#include "internal/database/copy.h"
-#include "internal/database/pad.h"
-#include "internal/database/transpose.h"
-#include "internal/database/padtranspose.h"
-
-#include "internal/utilities.h"
+#include "utilities.hpp"
+
+#include "database/database.hpp"
+#include "database/kernels/xaxpy.hpp"
+#include "database/kernels/xdot.hpp"
+#include "database/kernels/xgemv.hpp"
+#include "database/kernels/xger.hpp"
+#include "database/kernels/xgemm.hpp"
+#include "database/kernels/copy.hpp"
+#include "database/kernels/pad.hpp"
+#include "database/kernels/transpose.hpp"
+#include "database/kernels/padtranspose.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // Initializes the database
 const std::vector<Database::DatabaseEntry> Database::database = {
-  XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
-  XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
-  XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
-  XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
-  XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
-  CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
-  PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
-  TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
-  PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
+  XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
+  XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
+  XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
+  XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
+  XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
+  CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
+  PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
+  TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
+  PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
 };
 
 // =================================================================================================
diff --git a/include/internal/database.h b/src/database/database.hpp
index ca79fdad..0987cbed 100644
--- a/include/internal/database.h
+++ b/src/database/database.hpp
@@ -21,7 +21,7 @@
 #include <vector>
 #include <unordered_map>
 
-#include "internal/utilities.h"
+#include "utilities.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -67,15 +67,15 @@ class Database {
   };
 
   // The database consists of separate database entries, stored together in a vector
-  static const DatabaseEntry XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
-  static const DatabaseEntry XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
-  static const DatabaseEntry XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
-  static const DatabaseEntry XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
-  static const DatabaseEntry XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
-  static const DatabaseEntry CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
-  static const DatabaseEntry PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
-  static const DatabaseEntry TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
-  static const DatabaseEntry PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
+  static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
+  static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
+  static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
+  static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
+  static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
+  static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
+  static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
+  static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
+  static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
   static const std::vector<DatabaseEntry> database;
 
   // The constructor
diff --git a/include/internal/database/copy.h b/src/database/kernels/copy.hpp
index 59a9e03a..14946af4 100644
--- a/include/internal/database/copy.h
+++ b/src/database/kernels/copy.hpp
@@ -14,6 +14,24 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::CopyHalf = { // "Copy" parameter values at Precision::kHalf
+  "Copy", Precision::kHalf, {
+    { // Intel GPUs: one named device plus a vendor-wide "default" row
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+      }
+    },
+    { // Default: kDeviceTypeAll with vendor "default" (same values as the Intel rows)
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::CopySingle = {
   "Copy", Precision::kSingle, {
     { // AMD GPUs
@@ -41,9 +59,11 @@ const Database::DatabaseEntry Database::CopySingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
-        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
       }
     },
     { // Intel accelerators
@@ -54,6 +74,7 @@ const Database::DatabaseEntry Database::CopySingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 480",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
@@ -96,9 +117,11 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
-        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
       }
     },
     { // Intel accelerators
@@ -109,6 +132,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@@ -161,6 +185,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 480",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
@@ -215,6 +240,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
diff --git a/include/internal/database/pad.h b/src/database/kernels/pad.hpp
index d2de19e4..db4df9f0 100644
--- a/include/internal/database/pad.h
+++ b/src/database/kernels/pad.hpp
@@ -14,6 +14,24 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::PadHalf = { // "Pad" parameter values at Precision::kHalf
+  "Pad", Precision::kHalf, {
+    { // Intel GPUs: one named device plus a vendor-wide "default" row
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+      }
+    },
+    { // Default: kDeviceTypeAll with vendor "default" (same values as the Intel rows)
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::PadSingle = {
   "Pad", Precision::kSingle, {
     { // AMD GPUs
@@ -41,9 +59,11 @@ const Database::DatabaseEntry Database::PadSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris Pro",                                        { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // Intel accelerators
@@ -54,6 +74,7 @@ const Database::DatabaseEntry Database::PadSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX 480",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
@@ -67,7 +88,7 @@ const Database::DatabaseEntry Database::PadSingle = {
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
   }
@@ -102,9 +123,11 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris Pro",                                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // Intel accelerators
@@ -115,6 +138,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -169,6 +193,7 @@ const Database::DatabaseEntry Database::PadDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -223,6 +248,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
diff --git a/include/internal/database/padtranspose.h b/src/database/kernels/padtranspose.hpp
index b1db1b21..7fedd15a 100644
--- a/include/internal/database/padtranspose.h
+++ b/src/database/kernels/padtranspose.hpp
@@ -14,6 +14,24 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::PadtransposeHalf = {
+  "Padtranspose", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::PadtransposeSingle = {
   "Padtranspose", Precision::kSingle, {
     { // AMD GPUs
@@ -41,6 +59,8 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris",                                            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -54,6 +74,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@@ -102,6 +123,8 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Iris",                                            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -115,6 +138,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
@@ -169,6 +193,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
@@ -223,6 +248,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
diff --git a/include/internal/database/transpose.h b/src/database/kernels/transpose.hpp
index d87f79a6..4229e39f 100644
--- a/include/internal/database/transpose.h
+++ b/src/database/kernels/transpose.hpp
@@ -14,6 +14,24 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::TransposeHalf = {
+  "Transpose", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::TransposeSingle = {
   "Transpose", Precision::kSingle, {
     { // AMD GPUs
@@ -41,9 +59,11 @@ const Database::DatabaseEntry Database::TransposeSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
-        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
       }
     },
     { // Intel accelerators
@@ -54,6 +74,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 480",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@@ -102,6 +123,8 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@@ -109,6 +132,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@@ -163,6 +187,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 480",                                 { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@@ -211,6 +236,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
diff --git a/include/internal/database/xaxpy.h b/src/database/kernels/xaxpy.hpp
index 55be0bcb..d8088ca2 100644
--- a/include/internal/database/xaxpy.h
+++ b/src/database/kernels/xaxpy.hpp
@@ -14,6 +14,24 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::XaxpyHalf = {
+  "Xaxpy", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",4}, {"WGS",512}, {"WPT",8} } },
+        { "default",                                         { {"VW",4}, {"WGS",512}, {"WPT",8} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW",4}, {"WGS",512}, {"WPT",8} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::XaxpySingle = {
   "Xaxpy", Precision::kSingle, {
     { // AMD GPUs
@@ -41,6 +59,8 @@ const Database::DatabaseEntry Database::XaxpySingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",8}, {"WGS",256}, {"WPT",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",1}, {"WGS",512}, {"WPT",2} } },
         { "Iris",                                            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",128}, {"WPT",2} } },
         { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -54,6 +74,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 480",                                 { {"VW",4}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
@@ -102,9 +123,11 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",2}, {"WGS",512}, {"WPT",1} } },
         { "Iris",                                            { {"VW",2}, {"WGS",128}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",256}, {"WPT",8} } },
-        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
       }
     },
     { // Intel accelerators
@@ -115,6 +138,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"VW",1}, {"WGS",512}, {"WPT",1} } },
         { "GeForce GTX 480",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",512}, {"WPT",1} } },
@@ -169,6 +193,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 480",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -223,6 +248,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 480",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
diff --git a/include/internal/database/xdot.h b/src/database/kernels/xdot.hpp
index f9ae975b..48288f95 100644
--- a/include/internal/database/xdot.h
+++ b/src/database/kernels/xdot.hpp
@@ -14,6 +14,24 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::XdotHalf = {
+  "Xdot", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",32}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::XdotSingle = {
   "Xdot", Precision::kSingle, {
     { // AMD GPUs
@@ -33,12 +51,15 @@ const Database::DatabaseEntry Database::XdotSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",512}, {"WGS2",64} } },
-        { "default",                                         { {"WGS1",512}, {"WGS2",64} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",128}, {"WGS2",32} } },
         { "GeForce GTX 480",                                 { {"WGS1",512}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",128} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",32} } },
@@ -49,7 +70,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
       }
     },
   }
@@ -76,18 +97,21 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",32}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",32}, {"WGS2",32} } },
         { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 480",                                 { {"WGS1",512}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
       }
     },
     { // Default
@@ -119,6 +143,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",128}, {"WGS2",32} } },
         { "GeForce GTX 480",                                 { {"WGS1",512}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 980",                                 { {"WGS1",128}, {"WGS2",32} } },
@@ -156,6 +181,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 480",                                 { {"WGS1",512}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX 980",                                 { {"WGS1",64}, {"WGS2",32} } },
diff --git a/include/internal/database/xgemm.h b/src/database/kernels/xgemm.hpp
index 9ca2bff5..27cebc8a 100644
--- a/include/internal/database/xgemm.h
+++ b/src/database/kernels/xgemm.hpp
@@ -14,6 +14,18 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::XgemmHalf = {
+  "Xgemm", Precision::kHalf, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::XgemmSingle = {
   "Xgemm", Precision::kSingle, {
     { // AMD GPUs
@@ -41,9 +53,11 @@ const Database::DatabaseEntry Database::XgemmSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
         { "Iris",                                            { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
         { "Iris Pro",                                        { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
-        { "default",                                         { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
     { // Intel accelerators
@@ -54,6 +68,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
@@ -62,7 +77,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
         { "Tesla K20m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "Tesla K40m",                                      { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
       }
     },
     { // Default
@@ -102,9 +117,11 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
         { "Iris",                                            { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
     { // Intel accelerators
@@ -115,6 +132,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
         { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
@@ -169,6 +187,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
@@ -177,7 +196,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Tesla K20m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Tesla K40m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
     { // Default
@@ -223,6 +242,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
diff --git a/include/internal/database/xgemv.h b/src/database/kernels/xgemv.hpp
index bbbe62f6..ce258f2f 100644
--- a/include/internal/database/xgemv.h
+++ b/src/database/kernels/xgemv.hpp
@@ -14,6 +14,24 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::XgemvHalf = {
+  "Xgemv", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "default",                                         { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::XgemvSingle = {
   "Xgemv", Precision::kSingle, {
     { // AMD GPUs
@@ -34,9 +52,11 @@ const Database::DatabaseEntry Database::XgemvSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
         { "Iris",                                            { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
         { "Iris Pro",                                        { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
       }
     },
     { // Intel accelerators
@@ -47,6 +67,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
         { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
         { "GeForce GTX 750 Ti",                              { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
@@ -88,6 +109,8 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
         { "Iris",                                            { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
         { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
@@ -101,6 +124,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
         { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
         { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@@ -144,6 +168,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
         { "GeForce GTX 480",                                 { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
         { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
@@ -191,6 +216,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
         { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
         { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
       }
diff --git a/include/internal/database/xger.h b/src/database/kernels/xger.hpp
index dae857cd..3727cc57 100644
--- a/include/internal/database/xger.h
+++ b/src/database/kernels/xger.hpp
@@ -14,6 +14,24 @@
 namespace clblast {
 // =================================================================================================
 
+const Database::DatabaseEntry Database::XgerHalf = {
+  "Xger", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
 const Database::DatabaseEntry Database::XgerSingle = {
   "Xger", Precision::kSingle, {
     { // AMD GPUs
@@ -40,12 +58,15 @@ const Database::DatabaseEntry Database::XgerSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
+        { "default",                                         { {"WGS1",8}, {"WGS2",1}, {"WPT",2} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
         { "GeForce GTX 480",                                 { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@@ -54,7 +75,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
       }
     },
   }
@@ -88,12 +109,15 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "Iris Pro",                                        { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
-        { "default",                                         { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
+        { "default",                                         { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX 480",                                 { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
@@ -136,6 +160,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX 480",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
@@ -178,6 +203,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX 480",                                 { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index b9e52e17..08c47d87 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -19,20 +19,36 @@ R"(
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this file is used outside of the CLBlast library.
 #ifndef PRECISION
-  #define PRECISION 32      // Data-types: single or double precision, complex or regular
+  #define PRECISION 32      // Data-types: half, single or double precision, complex or regular
 #endif
 
 // =================================================================================================
 
+// Enable support for half-precision
+#if PRECISION == 16
+  #pragma OPENCL EXTENSION cl_khr_fp16: enable
+#endif
+
 // Enable support for double-precision
 #if PRECISION == 64 || PRECISION == 6464
   #if __OPENCL_VERSION__ <= CL_VERSION_1_1
      #pragma OPENCL EXTENSION cl_khr_fp64: enable
   #endif
 #endif
 
+// Half-precision
+#if PRECISION == 16
+  typedef half real;
+  typedef half2 real2;
+  typedef half4 real4;
+  typedef half8 real8;
+  typedef half16 real16;
+  #define ZERO 0
+  #define ONE 1
+  #define SMALLEST -1.0e14
+
 // Single-precision
-#if PRECISION == 32
+#elif PRECISION == 32
   typedef float real;
   typedef float2 real2;
   typedef float4 real4;
@@ -68,7 +84,7 @@ R"(
   #define ONE 1.0f
   #define SMALLEST -1.0e37f
 
-// Complex Double-precision
+// Complex double-precision
 #elif PRECISION == 6464
   typedef struct cdouble {double x; double y;} real;
   typedef struct cdouble2 {real x; real y;} real2;
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index 574beb43..e0efadc1 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -23,9 +23,10 @@ R"(
 
 // Full version of the kernel with offsets and strided accesses
 __attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const real alpha,
+__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc) {
+  const real alpha = arg_alpha[0];
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -40,9 +41,11 @@ __kernel void Xaxpy(const int n, const real alpha,
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
 __attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const real alpha,
+__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
                         const __global realV* restrict xgm,
                         __global realV* ygm) {
+  const real alpha = arg_alpha[0];
+
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 30b131b4..65b4291f 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -211,13 +211,17 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
 
 // Full version of the kernel
 __attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
+__kernel void Xgemv(const int m, const int n,
+                    const __constant real* restrict arg_alpha,
+                    const __constant real* restrict arg_beta,
                     const int a_rotated,
                     const __global real* restrict agm, const int a_offset, const int a_ld,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc,
                     const int do_conjugate, const int parameter,
                     const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
 
   // Local memory for the vector X
   __local real xlm[WGS1];
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 61fdffa3..6a494e84 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -95,13 +95,18 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
 __attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n, const real alpha, const real beta,
+__kernel void XgemvFast(const int m, const int n,
+                        const __constant real* restrict arg_alpha,
+                        const __constant real* restrict arg_beta,
                         const int a_rotated,
                         const __global realVF* restrict agm, const int a_offset, const int a_ld,
                         const __global real* restrict xgm, const int x_offset, const int x_inc,
                         __global real* ygm, const int y_offset, const int y_inc,
                         const int do_conjugate, const int parameter,
                         const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
+
   // Local memory for the vector X
   __local real xlm[WGS2];
 
@@ -192,13 +197,18 @@ __kernel void XgemvFast(const int m, const int n, const real alpha, const real b
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
 __attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n, const real alpha, const real beta,
+__kernel void XgemvFastRot(const int m, const int n,
+                           const __constant real* restrict arg_alpha,
+                           const __constant real* restrict arg_beta,
                            const int a_rotated,
                            const __global realVFR* restrict agm, const int a_offset, const int a_ld,
                            const __global real* restrict xgm, const int x_offset, const int x_inc,
                            __global real* ygm, const int y_offset, const int y_inc,
                            const int do_conjugate, const int parameter,
                            const int kl, const int ku) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
+
   // Local memory for the vector X
   __local real xlm[WGS3];
 
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index d377fbb0..63817afb 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -19,11 +19,13 @@ R"(
 
 // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xger(const int max1, const int max2, const real alpha,
+__kernel void Xger(const int max1, const int max2,
+                   const __constant real* restrict arg_alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    const __global real* ygm, const int y_offset, const int y_inc,
                    __global real* restrict agm, const int a_offset, const int a_ld,
                    const int is_rowmajor) {
+  const real alpha = arg_alpha[0];
 
   // Register storage for X and Y
   real xvalues[WPT];
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index edb94ca8..fc635f2e 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -19,10 +19,12 @@ R"(
 
 // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher(const int n, const real alpha,
+__kernel void Xher(const int n,
+                   const __constant real* restrict arg_alpha,
                    const __global real* restrict xgm, const int x_offset, const int x_inc,
                    __global real* restrict agm, const int a_offset, const int a_ld,
                    const int is_upper, const int is_rowmajor) {
+  const real alpha = arg_alpha[0];
 
   // Register storage for X and XT
   real xvalues[WPT];
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index 4a2edce8..a66f255f 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -19,11 +19,13 @@ R"(
 
 // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
 __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher2(const int n, const real alpha,
+__kernel void Xher2(const int n,
+                    const __constant real* restrict arg_alpha,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     const __global real* restrict ygm, const int y_offset, const int y_inc,
                     __global real* restrict agm, const int a_offset, const int a_ld,
                     const int is_upper, const int is_rowmajor) {
+  const real alpha = arg_alpha[0];
 
   // Register storage for X and Y
   real xvalues[WPT];
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
new file mode 100644
index 00000000..53cc161a
--- /dev/null
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -0,0 +1,106 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to convert hermitian matrices to/from general matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464)
+
+// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void HermLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-hermitian matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) {
+            result = src[id_two*src_ld + id_one + src_offset];
+            if (id_one == id_two) { result.y = ZERO; }
+          }
+          else {
+            result = src[id_one*src_ld + id_two + src_offset];
+            COMPLEX_CONJUGATE(result);
+          }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix's data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void HermUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-hermitian matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) {
+            result = src[id_two*src_ld + id_one + src_offset];
+            if (id_one == id_two) { result.y = ZERO; }
+          }
+          else {
+            result = src[id_one*src_ld + id_two + src_offset];
+            COMPLEX_CONJUGATE(result);
+          }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
new file mode 100644
index 00000000..c6ce93ca
--- /dev/null
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to convert symmetric matrices to/from general matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_SYMM)
+
+// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmLowerToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-symmetric matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Same as above, but now the matrix's data is stored in the upper-triangle
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void SymmUpperToSquared(const int src_dim,
+                                 const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim,
+                                 const int dest_ld, const int dest_offset,
+                                 __global real* dest) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-symmetric matrix
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
+          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
new file mode 100644
index 00000000..fdd2461a
--- /dev/null
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -0,0 +1,98 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains kernels to convert triangular matrices to/from general matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+#if defined(ROUTINE_TRMM)
+
+// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
+// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
+// Elements outside of that triangle (and outside of the source) are set to zero. If unit_diagonal
+// is non-zero, diagonal elements within the source are set to one and are not read from 'src'.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void TriaLowerToSquared(const int src_dim, const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim, const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int unit_diagonal) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the lower-triangular matrix (skipping the load for a unit diagonal)
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_two == id_one && unit_diagonal) { SetToOne(result); }
+          else if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
+          // Else: result is zero
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+// Kernel to populate a squared triangular matrix from a source of which only the upper-triangle
+// (id_one <= id_two) holds data; all other elements are set to zero. If unit_diagonal is
+// non-zero, diagonal elements within the source are set to one and are not read from 'src'.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void TriaUpperToSquared(const int src_dim, const int src_ld, const int src_offset,
+                                 __global const real* restrict src,
+                                 const int dest_dim, const int dest_ld, const int dest_offset,
+                                 __global real* dest,
+                                 const int unit_diagonal) {
+
+  // Loops over the work per thread in both dimensions
+  #pragma unroll
+  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
+    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
+      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
+      if (id_two < dest_dim && id_one < dest_dim) {
+
+        // Loads data from the upper-triangular matrix (skipping the load for a unit diagonal)
+        real result;
+        SetToZero(result);
+        if (id_two < src_dim && id_one < src_dim) {
+          if (id_one == id_two && unit_diagonal) { SetToOne(result); }
+          else if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
+          // Else: result is zero
+        }
+
+        // Stores the result in the destination matrix
+        dest[id_two*dest_ld + id_one + dest_offset] = result;
+      }
+    }
+  }
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
new file mode 100644
index 00000000..09e54e6d
--- /dev/null
+++ b/src/kernels/level3/copy_fast.opencl
@@ -0,0 +1,96 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS routines. This file contains
+// kernels to copy matrices.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Data-widths: 'realC' is the 'real' type of width COPY_VW (handled values: 1, 2, 4, 8, and 16)
+#if COPY_VW == 1
+  typedef real realC;
+#elif COPY_VW == 2
+  typedef real2 realC;
+#elif COPY_VW == 4
+  typedef real4 realC;
+#elif COPY_VW == 8
+  typedef real8 realC;
+#elif COPY_VW == 16
+  typedef real16 realC;
+#endif
+
+// =================================================================================================
+
+// Fast copy kernel which also multiplies by alpha (arg_alpha[0]). Requires 'ld' and the number of
+// threads in dimension 0 to be a multiple of COPY_VW, and equally-sized matrices without offset.
+__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+__kernel void CopyMatrixFast(const int ld,
+                             __global const realC* restrict src,
+                             __global realC* dest,
+                             const __constant real* restrict arg_alpha) {
+  const real alpha = arg_alpha[0];
+  const int id_one = get_global_id(0);  // Does not depend on the loop below
+  #pragma unroll
+  for (int w_one=0; w_one<COPY_WPT; ++w_one) {
+    const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
+    const int id = id_two*(ld/COPY_VW) + id_one;  // Index in units of 'realC'
+    realC result;
+    #if COPY_VW == 1
+      Multiply(result, alpha, src[id]);
+    #elif COPY_VW == 2
+      Multiply(result.x, alpha, src[id].x);
+      Multiply(result.y, alpha, src[id].y);
+    #elif COPY_VW == 4
+      Multiply(result.x, alpha, src[id].x);
+      Multiply(result.y, alpha, src[id].y);
+      Multiply(result.z, alpha, src[id].z);
+      Multiply(result.w, alpha, src[id].w);
+    #elif COPY_VW == 8
+      Multiply(result.s0, alpha, src[id].s0);
+      Multiply(result.s1, alpha, src[id].s1);
+      Multiply(result.s2, alpha, src[id].s2);
+      Multiply(result.s3, alpha, src[id].s3);
+      Multiply(result.s4, alpha, src[id].s4);
+      Multiply(result.s5, alpha, src[id].s5);
+      Multiply(result.s6, alpha, src[id].s6);
+      Multiply(result.s7, alpha, src[id].s7);
+    #elif COPY_VW == 16
+      Multiply(result.s0, alpha, src[id].s0);
+      Multiply(result.s1, alpha, src[id].s1);
+      Multiply(result.s2, alpha, src[id].s2);
+      Multiply(result.s3, alpha, src[id].s3);
+      Multiply(result.s4, alpha, src[id].s4);
+      Multiply(result.s5, alpha, src[id].s5);
+      Multiply(result.s6, alpha, src[id].s6);
+      Multiply(result.s7, alpha, src[id].s7);
+      Multiply(result.s8, alpha, src[id].s8);
+      Multiply(result.s9, alpha, src[id].s9);
+      Multiply(result.sA, alpha, src[id].sA);
+      Multiply(result.sB, alpha, src[id].sB);
+      Multiply(result.sC, alpha, src[id].sC);
+      Multiply(result.sD, alpha, src[id].sD);
+      Multiply(result.sE, alpha, src[id].sE);
+      Multiply(result.sF, alpha, src[id].sF);
+    #endif
+    dest[id] = result;
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
new file mode 100644
index 00000000..d276cc60
--- /dev/null
+++ b/src/kernels/level3/copy_pad.opencl
@@ -0,0 +1,113 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the common kernels shared among different BLAS functions. This file contains
+// kernels to copy and pad matrices in various ways, including:
+// 1) copying into a larger matrix by adding padding
+// 2) copying into a smaller matrix by optionally removing padding. This is the general version
+//    without restrictions, see 'copy_fast.opencl' for a faster but more restricted copy kernel.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Copies the source matrix into the destination matrix while multiplying by alpha (arg_alpha[0]).
+// Destination elements without a counterpart in the source matrix are based on a zero value
+// instead (padding). The ld value and the offset of the two matrices are allowed to differ.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void CopyPadMatrix(const int src_one, const int src_two,
+                            const int src_ld, const int src_offset,
+                            __global const real* restrict src,
+                            const int dest_one, const int dest_two,
+                            const int dest_ld, const int dest_offset,
+                            __global real* dest,
+                            const __constant real* restrict arg_alpha,
+                            const int do_conjugate) {
+  const real alpha = arg_alpha[0];
+
+  // Every thread processes PAD_WPTX by PAD_WPTY elements of the destination matrix
+  #pragma unroll
+  for (int wpt0=0; wpt0<PAD_WPTX; ++wpt0) {
+    const int idx0 = (get_group_id(0)*PAD_WPTX + wpt0) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int wpt1=0; wpt1<PAD_WPTY; ++wpt1) {
+      const int idx1 = (get_group_id(1)*PAD_WPTY + wpt1) * PAD_DIMY + get_local_id(1);
+      if (idx0 >= dest_one || idx1 >= dest_two) { continue; }
+
+      // Starts from a zero value and only replaces it by data from the source matrix when the
+      // indices are within the bounds of the source.
+      real value;
+      SetToZero(value);
+      if (idx0 < src_one && idx1 < src_two) {
+        value = src[idx1*src_ld + idx0 + src_offset];
+      }
+
+      // Optionally conjugates the value, after which alpha is applied upon storing
+      const int dest_id = idx1*dest_ld + idx0 + dest_offset;
+      if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
+      Multiply(dest[dest_id], alpha, value);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Un-pads a matrix while multiplying by alpha (arg_alpha[0]): reads from a padded source matrix,
+// but writes only the 'dest_one' by 'dest_two' elements of the destination matrix. The ld value
+// and offset can differ. For SYRK/HERK/SYR2K/HER2K, 'upper'/'lower' restrict what is written.
+__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+__kernel void CopyMatrix(const int src_one, const int src_two,
+                         const int src_ld, const int src_offset,
+                         __global const real* restrict src,
+                         const int dest_one, const int dest_two,
+                         const int dest_ld, const int dest_offset,
+                         __global real* dest,
+                         const __constant real* restrict arg_alpha,
+                         const int upper, const int lower,
+                         const int diagonal_imag_zero) {
+  const real alpha = arg_alpha[0];
+
+  // Every thread processes PAD_WPTX by PAD_WPTY elements
+  #pragma unroll
+  for (int wpt0=0; wpt0<PAD_WPTX; ++wpt0) {
+    const int idx0 = (get_group_id(0)*PAD_WPTX + wpt0) * PAD_DIMX + get_local_id(0);
+    #pragma unroll
+    for (int wpt1=0; wpt1<PAD_WPTY; ++wpt1) {
+      const int idx1 = (get_group_id(1)*PAD_WPTY + wpt1) * PAD_DIMY + get_local_id(1);
+
+      // Masking in case of triangular matrices: writes only the requested upper or lower part
+      bool in_triangle = true;
+      #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+        if (upper == 1) { in_triangle = (idx1 >= idx0); }
+        else if (lower == 1) { in_triangle = (idx1 <= idx0); }
+      #endif
+      const bool in_bounds = (idx1 < dest_two && idx0 < dest_one);
+
+      // Copies the value into the destination matrix. The source is not bounds-checked, as the
+      // destination matrix is known to be smaller than or equal to the source matrix.
+      if (in_triangle && in_bounds) {
+        const int dest_id = idx1*dest_ld + idx0 + dest_offset;
+        real value = src[idx1*src_ld + idx0 + src_offset];
+        if (diagonal_imag_zero == 1 && idx0 == idx1) { ImagToZero(value); }
+        Multiply(dest[dest_id], alpha, value);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/copy.opencl b/src/kernels/level3/level3.opencl
index 7dde688b..bf14ab12 100644
--- a/src/kernels/level3/copy.opencl
+++ b/src/kernels/level3/level3.opencl
@@ -7,8 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file contains the common kernels shared among different BLAS routines. This file contains
-// kernels to copy matrices.
+// This file contains the common functions and parameters specific for level 3 BLAS kernels.
 //
 // =================================================================================================
 
@@ -20,6 +19,8 @@ R"(
 
 // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 // this kernel file is used outside of the CLBlast library.
+
+// For the 'fast' copy kernel
 #ifndef COPY_DIMX
   #define COPY_DIMX 8      // Local workgroup size in the first dimension (x)
 #endif
@@ -33,37 +34,44 @@ R"(
   #define COPY_VW 1        // Vector width in the second dimension (y)
 #endif
 
-// =================================================================================================
-
-// Data-widths
-#if COPY_VW == 1
-  typedef real realC;
-#elif COPY_VW == 2
-  typedef real2 realC;
-#elif COPY_VW == 4
-  typedef real4 realC;
-#elif COPY_VW == 8
-  typedef real8 realC;
-#elif COPY_VW == 16
-  typedef real16 realC;
+// For the padding/copy kernels and the conversion kernels
+#ifndef PAD_DIMX
+  #define PAD_DIMX 8      // Local workgroup size in the first dimension (x)
+#endif
+#ifndef PAD_DIMY
+  #define PAD_DIMY 8      // Local workgroup size in the second dimension (y)
+#endif
+#ifndef PAD_WPTX
+  #define PAD_WPTX 1      // Work per thread in the first dimension (x)
+#endif
+#ifndef PAD_WPTY
+  #define PAD_WPTY 1      // Work per thread in the second dimension (y)
 #endif
 
-// =================================================================================================
+// For the 'fast' transpose kernel
+#ifndef TRA_DIM
+  #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef TRA_WPT
+  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
+#endif
+#ifndef TRA_PAD
+  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
+#endif
+#ifndef TRA_SHUFFLE
+  #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
+#endif
 
-// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
-// COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrix(const int ld,
-                         __global const realC* restrict src,
-                         __global realC* dest) {
-  #pragma unroll
-  for (int w_one=0; w_one<COPY_WPT; ++w_one) {
-    const int id_one = get_global_id(0);
-    const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
-    const int id = id_two*(ld/COPY_VW) + id_one;
-    dest[id] = src[id];
-  }
-}
+// For the padding/transpose kernels
+#ifndef PADTRA_TILE
+  #define PADTRA_TILE 8   // Number of local threads in the two dimensions (x,y)
+#endif
+#ifndef PADTRA_WPT
+  #define PADTRA_WPT 1    // Amount of work per thread
+#endif
+#ifndef PADTRA_PAD
+  #define PADTRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
+#endif
 
 // =================================================================================================
 
diff --git a/src/kernels/level3/pad.opencl b/src/kernels/level3/pad.opencl
deleted file mode 100644
index eefddce4..00000000
--- a/src/kernels/level3/pad.opencl
+++ /dev/null
@@ -1,353 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file contains the common kernels shared among different BLAS routines. This file contains
-// kernels to copy and pad matrices in various ways, including:
-// 1) copying into a larger matrix by adding padding
-// 2) copying into a smaller matrix by removing padding
-// 3) from upper/lower triangle into a full matrix
-//
-// =================================================================================================
-
-// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
-// literal). Comment-out this line for syntax-highlighting when developing.
-R"(
-
-// =================================================================================================
-
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef PAD_DIMX
-  #define PAD_DIMX 8      // Local workgroup size in the first dimension (x)
-#endif
-#ifndef PAD_DIMY
-  #define PAD_DIMY 8      // Local workgroup size in the second dimension (y)
-#endif
-#ifndef PAD_WPTX
-  #define PAD_WPTX 1      // Work per thread in the first dimension (x)
-#endif
-#ifndef PAD_WPTY
-  #define PAD_WPTY 1      // Work per thread in the second dimension (y)
-#endif
-
-// =================================================================================================
-
-// Copies a matrix from source to destination. The output is padded with zero values in case the
-// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
-// value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void PadMatrix(const int src_one, const int src_two,
-                        const int src_ld, const int src_offset,
-                        __global const real* restrict src,
-                        const int dest_one, const int dest_two,
-                        const int dest_ld, const int dest_offset,
-                        __global real* dest,
-                        const int do_conjugate) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_two && id_one < dest_one) {
-
-        // Loads data if the thread IDs are within bounds of the source matrix. Otherwise, set the
-        // value to be written to zero.
-        real value;
-        SetToZero(value);
-        if (id_two < src_two && id_one < src_one) {
-          value = src[id_two*src_ld + id_one + src_offset];
-        }
-
-        // Stores the value in the destination matrix
-        if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-        dest[id_two*dest_ld + id_one + dest_offset] = value;
-      }
-    }
-  }
-}
-
-// =================================================================================================
-
-// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
-// writes only the actual data back to the destination matrix. Again, the ld value and offset can
-// be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void UnPadMatrix(const int src_one, const int src_two,
-                          const int src_ld, const int src_offset,
-                          __global const real* restrict src,
-                          const int dest_one, const int dest_two,
-                          const int dest_ld, const int dest_offset,
-                          __global real* dest,
-                          const int upper, const int lower,
-                          const int diagonal_imag_zero) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-
-      // Masking in case of triangular matrices: updates only the upper or lower part
-      bool condition = true;
-      if (upper == 1) { condition = (id_two >= id_one); }
-      else if (lower == 1) { condition = (id_two <= id_one); }
-      if (condition) {
-
-        // Copies the value into the destination matrix. This is always within bounds of the source
-        // matrix, as we know that the destination matrix is smaller than the source.
-        if (id_two < dest_two && id_one < dest_one) {
-          real value = src[id_two*src_ld + id_one + src_offset];
-          if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
-          dest[id_two*dest_ld + id_one + dest_offset] = value;
-        }
-      }
-    }
-  }
-}
-
-// =================================================================================================
-#if defined(ROUTINE_SYMM)
-
-// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-symmetric matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-symmetric matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
-          else                  { result = src[id_one*src_ld + id_two + src_offset]; }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-#endif
-// =================================================================================================
-#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464)
-
-// Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-hermitian matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) {
-            result = src[id_two*src_ld + id_one + src_offset];
-            if (id_one == id_two) { result.y = ZERO; }
-          }
-          else {
-            result = src[id_one*src_ld + id_two + src_offset];
-            COMPLEX_CONJUGATE(result);
-          }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-hermitian matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) {
-            result = src[id_two*src_ld + id_one + src_offset];
-            if (id_one == id_two) { result.y = ZERO; }
-          }
-          else {
-            result = src[id_one*src_ld + id_two + src_offset];
-            COMPLEX_CONJUGATE(result);
-          }
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-#endif
-// =================================================================================================
-#if defined(ROUTINE_TRMM)
-
-// Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
-// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TrmmLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the lower-triangular matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_two <= id_one) { result = src[id_two*src_ld + id_one + src_offset]; }
-          if (id_two == id_one && unit_diagonal) { SetToOne(result); }
-          // Else: result is zero
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-// Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TrmmUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
-
-  // Loops over the work per thread in both dimensions
-  #pragma unroll
-  for (int w_one=0; w_one<PAD_WPTX; ++w_one) {
-    const int id_one = (get_group_id(0)*PAD_WPTX + w_one) * PAD_DIMX + get_local_id(0);
-    #pragma unroll
-    for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
-      const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
-      if (id_two < dest_dim && id_one < dest_dim) {
-
-        // Loads data from the upper-triangular matrix
-        real result;
-        SetToZero(result);
-        if (id_two < src_dim && id_one < src_dim) {
-          if (id_one <= id_two) { result = src[id_two*src_ld + id_one + src_offset]; }
-          if (id_one == id_two && unit_diagonal) { SetToOne(result); }
-          // Else: result is zero
-        }
-
-        // Stores the result in the destination matrix
-        dest[id_two*dest_ld + id_one + dest_offset] = result;
-      }
-    }
-  }
-}
-
-#endif
-// =================================================================================================
-
-// End of the C++11 raw string literal
-)"
-
-// =================================================================================================
diff --git a/src/kernels/level3/transpose.opencl b/src/kernels/level3/transpose_fast.opencl
index d726f7ec..d5c46a30 100644
--- a/src/kernels/level3/transpose.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -8,7 +8,8 @@
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
 // This file contains the common kernels shared among different BLAS functions. This file contains
-// kernels to transpose matrices.
+// a kernel to transpose matrices. This is a 'fast' version with restrictions, see the
+// 'transpose_pad.opencl' file for a general transpose kernel.
 //
 // =================================================================================================
 
@@ -17,22 +18,6 @@
 R"(
 
 // =================================================================================================
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef TRA_DIM
-  #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
-#endif
-#ifndef TRA_WPT
-  #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
-#endif
-#ifndef TRA_PAD
-  #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
-#endif
-#ifndef TRA_SHUFFLE
-  #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
-#endif
-
-// =================================================================================================
 
 // Data-widths
 #if TRA_WPT == 1
@@ -52,9 +37,11 @@ R"(
 // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
 // offset. A more general version is available in 'padtranspose.opencl'.
 __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrix(const int ld,
-                              __global const realT* restrict src,
-                              __global realT* dest) {
+__kernel void TransposeMatrixFast(const int ld,
+                                  __global const realT* restrict src,
+                                  __global realT* dest,
+                                  const __constant real* restrict arg_alpha) {
+  const real alpha = arg_alpha[0];
 
   // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
   // way over workgroups, breaking memory-bank dependencies.
@@ -132,12 +119,50 @@ __kernel void TransposeMatrix(const int ld,
     results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF};
   #endif
 
-  // Stores the results into the destination matrix
+  // Multiplies by alpha and then stores the results into the destination matrix
   #pragma unroll
   for (int w_two=0; w_two<TRA_WPT; ++w_two) {
+    realT result;
+    #if TRA_WPT == 1
+      Multiply(result, alpha, results[w_two]);
+    #elif TRA_WPT == 2
+      Multiply(result.x, alpha, results[w_two].x);
+      Multiply(result.y, alpha, results[w_two].y);
+    #elif TRA_WPT == 4
+      Multiply(result.x, alpha, results[w_two].x);
+      Multiply(result.y, alpha, results[w_two].y);
+      Multiply(result.z, alpha, results[w_two].z);
+      Multiply(result.w, alpha, results[w_two].w);
+    #elif TRA_WPT == 8
+      Multiply(result.s0, alpha, results[w_two].s0);
+      Multiply(result.s1, alpha, results[w_two].s1);
+      Multiply(result.s2, alpha, results[w_two].s2);
+      Multiply(result.s3, alpha, results[w_two].s3);
+      Multiply(result.s4, alpha, results[w_two].s4);
+      Multiply(result.s5, alpha, results[w_two].s5);
+      Multiply(result.s6, alpha, results[w_two].s6);
+      Multiply(result.s7, alpha, results[w_two].s7);
+    #elif TRA_WPT == 16
+      Multiply(result.s0, alpha, results[w_two].s0);
+      Multiply(result.s1, alpha, results[w_two].s1);
+      Multiply(result.s2, alpha, results[w_two].s2);
+      Multiply(result.s3, alpha, results[w_two].s3);
+      Multiply(result.s4, alpha, results[w_two].s4);
+      Multiply(result.s5, alpha, results[w_two].s5);
+      Multiply(result.s6, alpha, results[w_two].s6);
+      Multiply(result.s7, alpha, results[w_two].s7);
+      Multiply(result.s8, alpha, results[w_two].s8);
+      Multiply(result.s9, alpha, results[w_two].s9);
+      Multiply(result.sA, alpha, results[w_two].sA);
+      Multiply(result.sB, alpha, results[w_two].sB);
+      Multiply(result.sC, alpha, results[w_two].sC);
+      Multiply(result.sD, alpha, results[w_two].sD);
+      Multiply(result.sE, alpha, results[w_two].sE);
+      Multiply(result.sF, alpha, results[w_two].sF);
+    #endif
     const int id_one = gid0*TRA_DIM + get_local_id(0);
     const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
-    dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
+    dest[id_two*(ld/TRA_WPT) + id_one] = result;
   }
 }
 
diff --git a/src/kernels/level3/padtranspose.opencl b/src/kernels/level3/transpose_pad.opencl
index a6b70f0b..2de0c7bd 100644
--- a/src/kernels/level3/padtranspose.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -10,7 +10,9 @@
 // This file contains the common kernels shared among different BLAS functions. This file contains
 // kernels to transpose matrices in various ways, including:
 // 1) transposing into a larger matrix by adding padding
-// 2) transposing into a smaller matrix by removing padding
+// 2) transposing into a smaller matrix by optionally removing padding. This is the general version
+//    without restrictions, see the 'transpose_fast.opencl' file for a faster but more restricted
+//    transpose kernel.
 //
 // =================================================================================================
 
@@ -19,29 +21,19 @@
 R"(
 
 // =================================================================================================
-// Parameters set by the tuner or by the database. Here they are given a basic default value in case
-// this kernel file is used outside of the CLBlast library.
-#ifndef PADTRA_TILE
-  #define PADTRA_TILE 8   // Number of local threads in the two dimensions (x,y)
-#endif
-#ifndef PADTRA_WPT
-  #define PADTRA_WPT 1    // Amount of work per thread
-#endif
-#ifndef PADTRA_PAD
-  #define PADTRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
-#endif
 
-// =================================================================================================
-
-// Same as PadCopyMatrix, but now also does the transpose
+// Transposes a matrix from source to destination. The output is padded with zero values in case the
+// destination matrix dimensions are larger than the transposed source matrix dimensions.
 __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void PadTransposeMatrix(const int src_one, const int src_two,
+__kernel void TransposePadMatrix(const int src_one, const int src_two,
                                  const int src_ld, const int src_offset,
                                  __global const real* restrict src,
                                  const int dest_one, const int dest_two,
                                  const int dest_ld, const int dest_offset,
                                  __global real* dest,
+                                 const __constant real* restrict arg_alpha,
                                  const int do_conjugate) {
+  const real alpha = arg_alpha[0];
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -85,7 +77,7 @@ __kernel void PadTransposeMatrix(const int src_one, const int src_two,
       if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
         real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
         if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+        Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
       }
     }
   }
@@ -93,16 +85,20 @@ __kernel void PadTransposeMatrix(const int src_one, const int src_two,
 
 // =================================================================================================
 
-// Same as UnPadCopyMatrix, but now also does the transpose
+// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
+// padded source matrix, but only the actual data is written back to the transposed destination
+// matrix. This kernel optionally checks for upper/lower triangular matrices.
 __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
-                                   const int src_ld, const int src_offset,
-                                   __global const real* restrict src,
-                                   const int dest_one, const int dest_two,
-                                   const int dest_ld, const int dest_offset,
-                                   __global real* dest,
-                                   const int upper, const int lower,
-                                   const int diagonal_imag_zero) {
+__kernel void TransposeMatrix(const int src_one, const int src_two,
+                              const int src_ld, const int src_offset,
+                              __global const real* restrict src,
+                              const int dest_one, const int dest_two,
+                              const int dest_ld, const int dest_offset,
+                              __global real* dest,
+                              const __constant real* restrict arg_alpha,
+                              const int upper, const int lower,
+                              const int diagonal_imag_zero) {
+  const real alpha = arg_alpha[0];
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -141,15 +137,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
 
       // Masking in case of triangular matrices: updates only the upper or lower part
       bool condition = true;
-      if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
-      else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+      #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+        if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
+        else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+      #endif
       if (condition) {
 
         // Stores the transposed value in the destination matrix
         if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
           real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
           if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
-          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+          Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
         }
       }
     }
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index a2a555de..1ad0a558 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -7,10 +7,10 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto
+// This file contains an optimized matrix-multiplication kernel inspired by the paper by Matsumoto
 // et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable
 // (and tunable!) using more or less the same parameters/naming conventions as in the paper. It
-// supports single and double precision (SGEMM/DGEMM) through a pre-processor define.
+// supports different data-types (SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM) through a pre-processor define.
 //
 // Matrices are accessed as follows:
 // A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m)
@@ -31,7 +31,7 @@
 //    o-------o        o-----o  
 //                              
 //
-// This kernel is seperated into two files. This is part 1 out of 2,
+// This kernel is separated into two files. This is part 1 out of 2.
 //
 // =================================================================================================
 
@@ -68,7 +68,7 @@ R"(
   #define KWI 1      // Unroll factor of the KWG loop (smaller or equal than KWG)
 #endif
 #ifndef VWM
-  #define VWM 1      // Vector width of matrices A and C 
+  #define VWM 1      // Vector width of matrices A and C
 #endif
 #ifndef VWN
   #define VWN 1      // Vector width of matrix B
@@ -97,7 +97,12 @@ R"(
 #define NWB (NWG/NDIMB)               // Amount of loads-per-thread for matrix B (N-dimension)
 
 // Settings
-#define USE_VECTOR_MAD 0              // Unroll (0) or don't (1) unroll the vector MAD manually
+#ifndef USE_VECTOR_MAD
+  #define USE_VECTOR_MAD 0      // Unroll (0) or don't unroll (1) the vector MAD manually
+#endif
+#ifndef GLOBAL_MEM_FENCE
+  #define GLOBAL_MEM_FENCE 0    // Global synchronisation barrier for potentially better performance
+#endif
 
 // =================================================================================================
 
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 599e01d5..42c1127c 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -258,6 +258,9 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
       barrier(CLK_LOCAL_MEM_FENCE);
     #endif
   }
+  #if GLOBAL_MEM_FENCE == 1
+    barrier(CLK_GLOBAL_MEM_FENCE);
+  #endif
 }
 
 // =================================================================================================
@@ -267,10 +270,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
 // Main entry point of the kernel. This is the upper-triangular version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
-                         const real alpha, const real beta,
+                         const __constant real* restrict arg_alpha,
+                         const __constant real* restrict arg_beta,
                          const __global realM* restrict agm,
                          const __global realN* restrict bgm,
                          __global realM* cgm) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
 
   // Skip these threads if they do not contain threads contributing to the upper-triangle
   if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
@@ -304,10 +310,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
 // Main entry point of the kernel. This is the lower-triangular version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void XgemmLower(const int kSizeN, const int kSizeK,
-                         const real alpha, const real beta,
+                         const __constant real* restrict arg_alpha,
+                         const __constant real* restrict arg_beta,
                          const __global realM* restrict agm,
                          const __global realN* restrict bgm,
                          __global realM* cgm) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
 
   // Skip these threads if they do not contain threads contributing to the lower-triangle
   if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
@@ -345,10 +354,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
 // Main entry point of the kernel. This is the regular full version.
 __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
 __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
-                    const real alpha, const real beta,
+                    const __constant real* restrict arg_alpha,
+                    const __constant real* restrict arg_beta,
                     const __global realM* restrict agm,
                     const __global realN* restrict bgm,
                     __global realM* cgm) {
+  const real alpha = arg_alpha[0];
+  const real beta = arg_beta[0];
 
   // Allocates workgroup-private memory (local memory)
   #if SA == 1
diff --git a/include/internal/public_api.h b/src/public_api.hpp
index d0732297..d0732297 100644
--- a/include/internal/public_api.h
+++ b/src/public_api.hpp
diff --git a/src/routine.cc b/src/routine.cc
deleted file mode 100644
index eee4c7cc..00000000
--- a/src/routine.cc
+++ /dev/null
@@ -1,415 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Routine base class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: not much here, because no status codes can be returned
-template <typename T>
-Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
-                    const std::vector<std::string> &routines, const Precision precision):
-    precision_(precision),
-    routine_name_(name),
-    queue_(queue),
-    event_(event),
-    context_(queue_.GetContext()),
-    device_(queue_.GetDevice()),
-    device_name_(device_.Name()),
-    max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
-    max_work_item_sizes_(device_.MaxWorkItemSizes()),
-    max_work_group_size_(device_.MaxWorkGroupSize()),
-    db_(queue_, routines, precision_) {
-}
-
-// =================================================================================================
-
-// Separate set-up function to allow for status codes to be returned
-template <typename T>
-StatusCode Routine<T>::SetUp() {
-
-  // Queries the cache to see whether or not the program (context-specific) is already there
-  if (ProgramIsInCache()) { return StatusCode::kSuccess; }
-
-  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
-  // is, a program is created and stored in the cache
-  if (BinaryIsInCache()) {
-    try {
-      auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
-      auto program = Program(device_, context_, binary);
-      auto options = std::vector<std::string>();
-      program.Build(device_, options);
-      StoreProgramToCache(program);
-    } catch (...) { return StatusCode::kBuildProgramFailure; }
-    return StatusCode::kSuccess;
-  }
-
-  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
-  // program will be added to the cache.
-
-  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
-  auto extensions = device_.Capabilities();
-  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
-    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
-      return StatusCode::kNoDoublePrecision;
-    }
-  }
-
-  // As above, but for cl_khr_fp16 (half precision)
-  if (precision_ == Precision::kHalf) {
-    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
-      return StatusCode::kNoHalfPrecision;
-    }
-  }
-
-  // Loads the common header (typedefs and defines and such)
-  std::string common_header =
-    #include "kernels/common.opencl"
-  ;
-
-  // Collects the parameters for this device in the form of defines, and adds the precision
-  auto defines = db_.GetDefines();
-  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-
-  // Adds the name of the routine as a define
-  defines += "#define ROUTINE_"+routine_name_+"\n";
-
-  // Determines whether this is a specific device
-  const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc.";
-  const auto isGPU = device_.Type() == "GPU";
-
-  // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
-  // performance, but might result in a reduced accuracy.
-  if (isAMD && isGPU) {
-    defines += "#define USE_CL_MAD 1\n";
-  }
-
-  // For specific devices, use staggered/shuffled workgroup indices.
-  if (isAMD && isGPU) {
-    defines += "#define USE_STAGGERED_INDICES 1\n";
-  }
-
-  // Combines everything together into a single source string
-  auto source_string = defines + common_header + source_string_;
-
-  // Compiles the kernel
-  try {
-    auto program = Program(context_, source_string);
-    auto options = std::vector<std::string>();
-    auto build_status = program.Build(device_, options);
-
-    // Checks for compiler crashes/errors/warnings
-    if (build_status == BuildStatus::kError) {
-      auto message = program.GetBuildInfo(device_);
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
-      return StatusCode::kBuildProgramFailure;
-    }
-    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
-
-    // Store the compiled binary and program in the cache
-    const auto binary = program.GetIR();
-    StoreBinaryToCache(binary);
-    StoreProgramToCache(program);
-  } catch (...) { return StatusCode::kBuildProgramFailure; }
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Enqueues a kernel, waits for completion, and checks for errors
-template <typename T>
-StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
-                                 const std::vector<size_t> &local, EventPointer event,
-                                 std::vector<Event>& waitForEvents) {
-
-  // Tests for validity of the local thread sizes
-  if (local.size() > max_work_item_dimensions_) {
-    return StatusCode::kInvalidLocalNumDimensions; 
-  }
-  for (auto i=size_t{0}; i<local.size(); ++i) {
-    if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; }
-  }
-  auto local_size = size_t{1};
-  for (auto &item: local) { local_size *= item; }
-  if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }
-
-  // Make sure the global thread sizes are at least equal to the local sizes
-  for (auto i=size_t{0}; i<global.size(); ++i) {
-    if (global[i] < local[i]) { global[i] = local[i]; }
-  }
-
-  // Tests for local memory usage
-  auto local_mem_usage = kernel.LocalMemUsage(device_);
-  if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
-
-  // Launches the kernel (and checks for launch errors)
-  try {
-    kernel.Launch(queue_, global, local, event, waitForEvents);
-  } catch (...) { return StatusCode::kKernelLaunchError; }
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
-}
-
-// As above, but without an event waiting list
-template <typename T>
-StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
-                                 const std::vector<size_t> &local, EventPointer event) {
-  auto emptyWaitingList = std::vector<Event>();
-  return RunKernel(kernel, global, local, event, emptyWaitingList);
-}
-
-// =================================================================================================
-
-// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
-  try {
-    auto required_size = (ld*(two-1) + one + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
-  try {
-    auto required_size = (ld*(two-1) + one + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
-  } catch (...) { return StatusCode::kInvalidMatrixB; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
-  try {
-    auto required_size = (ld*(two-1) + one + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
-  } catch (...) { return StatusCode::kInvalidMatrixC; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size
-template <typename T>
-StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
-                                    const size_t offset, const size_t data_size) {
-  try {
-    auto required_size = (((n*(n+1))/2) + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                   const size_t inc, const size_t data_size) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
-  try {
-    auto required_size = ((n-1)*inc + 1 + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
-  } catch (...) { return StatusCode::kInvalidVectorX; }
-  return StatusCode::kSuccess;
-}
-
-// Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                   const size_t inc, const size_t data_size) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
-  try {
-    auto required_size = ((n-1)*inc + 1 + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
-  } catch (...) { return StatusCode::kInvalidVectorY; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector dot for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                     const size_t data_size) {
-  try {
-    auto required_size = (n + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
-  } catch (...) { return StatusCode::kInvalidVectorDot; }
-  return StatusCode::kSuccess;
-}
-
-// Tests vector index for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
-                                       const size_t offset, const size_t data_size) {
-  try {
-    auto required_size = (n + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
-  } catch (...) { return StatusCode::kInvalidVectorDot; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Copies or transposes a matrix and pads/unpads it with zeros
-template <typename T>
-StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
-                                              const size_t src_one, const size_t src_two,
-                                              const size_t src_ld, const size_t src_offset,
-                                              const Buffer<T> &src,
-                                              const size_t dest_one, const size_t dest_two,
-                                              const size_t dest_ld, const size_t dest_offset,
-                                              const Buffer<T> &dest,
-                                              const Program &program, const bool do_pad,
-                                              const bool do_transpose, const bool do_conjugate,
-                                              const bool upper, const bool lower,
-                                              const bool diagonal_imag_zero) {
-
-  // Determines whether or not the fast-version could potentially be used
-  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
-                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
-                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
-
-  // Determines the right kernel
-  auto kernel_name = std::string{};
-  if (do_transpose) {
-    if (use_fast_kernel &&
-        IsMultiple(src_ld, db_["TRA_WPT"]) &&
-        IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
-        IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) {
-      kernel_name = "TransposeMatrix";
-    }
-    else {
-      use_fast_kernel = false;
-      kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
-    }
-  }
-  else {
-    if (use_fast_kernel &&
-        IsMultiple(src_ld, db_["COPY_VW"]) &&
-        IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
-        IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) {
-      kernel_name = "CopyMatrix";
-    }
-    else {
-      use_fast_kernel = false;
-      kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix";
-    }
-  }
-
-  // Retrieves the kernel from the compiled binary
-  try {
-    auto kernel = Kernel(program, kernel_name);
-
-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(src_ld));
-      kernel.SetArgument(1, src());
-      kernel.SetArgument(2, dest());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(src_one));
-      kernel.SetArgument(1, static_cast<int>(src_two));
-      kernel.SetArgument(2, static_cast<int>(src_ld));
-      kernel.SetArgument(3, static_cast<int>(src_offset));
-      kernel.SetArgument(4, src());
-      kernel.SetArgument(5, static_cast<int>(dest_one));
-      kernel.SetArgument(6, static_cast<int>(dest_two));
-      kernel.SetArgument(7, static_cast<int>(dest_ld));
-      kernel.SetArgument(8, static_cast<int>(dest_offset));
-      kernel.SetArgument(9, dest());
-      if (do_pad) {
-        kernel.SetArgument(10, static_cast<int>(do_conjugate));
-      }
-      else {
-        kernel.SetArgument(10, static_cast<int>(upper));
-        kernel.SetArgument(11, static_cast<int>(lower));
-        kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
-      }
-    }
-
-    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
-    // parameters in the database.
-    auto status = StatusCode::kSuccess;
-    if (do_transpose) {
-      if (use_fast_kernel) {
-        auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
-                                          dest_two / db_["TRA_WPT"]};
-        auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
-        status = RunKernel(kernel, global, local, event, waitForEvents);
-      }
-      else {
-        auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
-                                          Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
-        auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
-        status = RunKernel(kernel, global, local, event, waitForEvents);
-      }
-    }
-    else {
-      if (use_fast_kernel) {
-        auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
-                                          dest_two / db_["COPY_WPT"]};
-        auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
-        status = RunKernel(kernel, global, local, event, waitForEvents);
-      }
-      else {
-        auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                          Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-        auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-        status = RunKernel(kernel, global, local, event, waitForEvents);
-      }
-    }
-    return status;
-  } catch (...) { return StatusCode::kInvalidKernel; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Routine<float>;
-template class Routine<double>;
-template class Routine<float2>;
-template class Routine<double2>;
-
-// =================================================================================================
-} // namespace clblast
diff --git a/src/routine.cpp b/src/routine.cpp
new file mode 100644
index 00000000..d3590896
--- /dev/null
+++ b/src/routine.cpp
@@ -0,0 +1,131 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Routine base class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: only initializes the members, because no status codes can be returned from here
+Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
+                 const std::vector<std::string> &routines, const Precision precision):
+    precision_(precision),
+    routine_name_(name),
+    queue_(queue),
+    event_(event),
+    context_(queue_.GetContext()),      // taken from the queue_ member initialized above
+    device_(queue_.GetDevice()),        // taken from the queue_ member initialized above
+    device_name_(device_.Name()),       // taken from the device_ member initialized above
+    db_(queue_, routines, precision_) {
+}
+
+// =================================================================================================
+
+// Separate set-up function to allow for status codes to be returned. Makes sure that a compiled
+// program for this routine is available in the program cache: either it is already there, or it
+// is re-created from a cached binary, or it is compiled from source.
+StatusCode Routine::SetUp() {
+
+  // Queries the cache to see whether or not the program (context-specific) is already there
+  if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
+
+  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
+  // is, a program is created, built and stored in the cache. A failed build is reported here.
+  if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
+    try {
+      auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
+      auto program = Program(device_, context_, binary);
+      auto options = std::vector<std::string>();
+      const auto build_status = program.Build(device_, options);
+      if (build_status == BuildStatus::kError) { return StatusCode::kBuildProgramFailure; }
+      if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
+      StoreProgramToCache(program, context_, precision_, routine_name_);
+    } catch (...) { return StatusCode::kBuildProgramFailure; }
+    return StatusCode::kSuccess;
+  }
+
+  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
+  // program will be added to the cache.
+
+  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
+  const auto extensions = device_.Capabilities();
+  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
+    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
+      return StatusCode::kNoDoublePrecision;
+    }
+  }
+
+  // As above, but for cl_khr_fp16 (half precision)
+  if (precision_ == Precision::kHalf) {
+    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
+      return StatusCode::kNoHalfPrecision;
+    }
+  }
+
+  // Loads the common header (typedefs and defines and such)
+  std::string common_header =
+    #include "kernels/common.opencl"
+  ;
+
+  // Collects the parameters for this device in the form of defines, and adds the precision
+  auto defines = db_.GetDefines();
+  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
+
+  // Adds the name of the routine as a define
+  defines += "#define ROUTINE_"+routine_name_+"\n";
+
+  // For AMD GPUs, use the non-IEEE754 compliant OpenCL mad() instruction (this can improve
+  // performance, but might reduce accuracy) and staggered/shuffled workgroup indices.
+  if (device_.IsAMD() && device_.IsGPU()) {
+    defines += "#define USE_CL_MAD 1\n";
+    defines += "#define USE_STAGGERED_INDICES 1\n";
+  }
+
+  // For ARM GPUs, add a global synchronisation barrier to the GEMM kernel to optimize
+  // performance through better cache behaviour
+  if (device_.IsARM() && device_.IsGPU()) {
+    defines += "#define GLOBAL_MEM_FENCE 1\n";
+  }
+
+  // Combines everything together into a single source string
+  const auto source_string = defines + common_header + source_string_;
+
+  // Compiles the kernel
+  try {
+    auto program = Program(context_, source_string);
+    auto options = std::vector<std::string>();
+    const auto build_status = program.Build(device_, options);
+
+    // Checks for compiler crashes/errors/warnings
+    if (build_status == BuildStatus::kError) {
+      const auto message = program.GetBuildInfo(device_);
+      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
+      return StatusCode::kBuildProgramFailure;
+    }
+    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
+
+    // Store the compiled binary and program in the cache
+    const auto binary = program.GetIR();
+    StoreBinaryToCache(binary, device_name_, precision_, routine_name_);
+    StoreProgramToCache(program, context_, precision_, routine_name_);
+  } catch (...) { return StatusCode::kBuildProgramFailure; }
+
+  // No errors, normal termination of this function
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routine.hpp b/src/routine.hpp
new file mode 100644
index 00000000..54b5779f
--- /dev/null
+++ b/src/routine.hpp
@@ -0,0 +1,68 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements all the basic functionality for the BLAS routines. This class serves as a
+// base class for the actual routines (e.g. Xaxpy, Xgemm). It contains common functionality such as
+// compiling the OpenCL kernel, connecting to the database, etc.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINE_H_
+#define CLBLAST_ROUTINE_H_
+
+#include <string>
+#include <vector>
+
+#include "utilities.hpp"
+#include "cache.hpp"
+#include "buffer_test.hpp"
+#include "database/database.hpp"
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Base class of the BLAS routines; see the comment at the top of this file for a description
+class Routine {
+ public:
+
+  // Base class constructor: stores the arguments and derives the context, device and database
+  explicit Routine(Queue &queue, EventPointer event, const std::string &name,
+                   const std::vector<std::string> &routines, const Precision precision);
+
+  // Set-up phase of the kernel: makes the compiled program available and returns a status code
+  StatusCode SetUp();
+
+ protected:
+
+  // Non-static variable for the precision
+  const Precision precision_;
+
+  // The routine's name and its kernel-source in string form (not set by the base constructor)
+  const std::string routine_name_;
+  std::string source_string_;
+
+  // The OpenCL objects, accessible only from derived classes. The constructor initializes
+  Queue queue_;
+  EventPointer event_;
+  const Context context_;  // context_ and device_ from queue_, so queue_ has to be declared first
+  const Device device_;
+
+  // OpenCL device properties (initialized from device_, so it has to be declared after it)
+  const std::string device_name_;
+
+  // Connection to the database for all the device-specific parameters
+  const Database db_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINE_H_
+#endif
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
new file mode 100644
index 00000000..c378df28
--- /dev/null
+++ b/src/routines/common.cpp
@@ -0,0 +1,65 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the common routine functions (see the header for more information).
+//
+// =================================================================================================
+
+#include <vector>
+
+#include "routines/common.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Checks the thread configuration and local memory usage against the device limits and launches
+// the kernel. Returns the status code of the first failing step, or kSuccess after the launch.
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event, std::vector<Event>& waitForEvents) {
+
+  // Tests for validity of the local thread sizes: there has to be one for every global dimension
+  if (local.size() != global.size() || local.size() > device.MaxWorkItemDimensions()) {
+    return StatusCode::kInvalidLocalNumDimensions;
+  }
+  const auto max_work_item_sizes = device.MaxWorkItemSizes();
+  for (auto i=size_t{0}; i<local.size(); ++i) {
+    if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+  }
+  auto local_size = size_t{1};
+  for (auto &item: local) { local_size *= item; }
+  if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
+
+  // Raises the global thread sizes to at least the local sizes (the lengths match, see above)
+  for (auto i=size_t{0}; i<global.size(); ++i) {
+    if (global[i] < local[i]) { global[i] = local[i]; }
+  }
+
+  // Tests for local memory usage
+  const auto local_mem_usage = kernel.LocalMemUsage(device);
+  if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
+
+  // Launches the kernel (and checks for launch errors)
+  try {
+    kernel.Launch(queue, global, local, event, waitForEvents);
+  } catch (...) { return StatusCode::kKernelLaunchError; }
+
+  return StatusCode::kSuccess;  // no errors, normal termination of this function
+}
+
+// Convenience overload: launches the kernel with an empty event waiting list
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event) {
+  std::vector<Event> no_wait_events;
+  return RunKernel(kernel, queue, device, global, local, event, no_wait_events);
+}
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
new file mode 100644
index 00000000..c99cd39d
--- /dev/null
+++ b/src/routines/common.hpp
@@ -0,0 +1,173 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the interfaces to common kernels, such as copying, padding, and
+// transposing a matrix. These functions are templated and thus header-only. This file also contains
+// other common functions to routines, such as a function to launch a kernel.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_COMMON_H_
+#define CLBLAST_ROUTINES_COMMON_H_
+
+#include <string>
+#include <vector>
+
+#include "clblast.h"
+#include "clpp11.hpp"
+#include "database/database.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Validates thread sizes and local memory usage against the device, then launches the kernel
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event, std::vector<Event>& waitForEvents);
+
+// As above, but without an event waiting list (an empty one is passed on)
+StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
+                     std::vector<size_t> global, const std::vector<size_t> &local,
+                     EventPointer event);
+
+// =================================================================================================
+
+// Copies or transposes a matrix, optionally padding/unpadding it with zeros or writing to
+// symmetric/triangular matrices. Returns RunKernel's status, or kInvalidKernel on an exception.
+template <typename T>
+StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
+                                  const Database &db,
+                                  EventPointer event, std::vector<Event>& waitForEvents,
+                                  const size_t src_one, const size_t src_two,
+                                  const size_t src_ld, const size_t src_offset,
+                                  const Buffer<T> &src,
+                                  const size_t dest_one, const size_t dest_two,
+                                  const size_t dest_ld, const size_t dest_offset,
+                                  const Buffer<T> &dest,
+                                  const T alpha,
+                                  const Program &program, const bool do_pad,
+                                  const bool do_transpose, const bool do_conjugate,
+                                  const bool upper = false, const bool lower = false,
+                                  const bool diagonal_imag_zero = false) {
+
+  // The fast kernels need zero offsets, equal source/destination shapes and no special options
+  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
+                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
+
+  // Determines the right kernel; the fast ones need sizes matching their thread configuration
+  auto kernel_name = std::string{};
+  if (do_transpose) {
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["TRA_WPT"]) &&
+        IsMultiple(src_one, db["TRA_WPT"]*db["TRA_DIM"]) &&  // keeps the global size a multiple
+        IsMultiple(src_two, db["TRA_WPT"]*db["TRA_DIM"])) {  // of the local size TRA_DIM
+      kernel_name = "TransposeMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "TransposePadMatrix" : "TransposeMatrix";
+    }
+  }
+  else {
+    if (use_fast_kernel &&
+        IsMultiple(src_ld, db["COPY_VW"]) &&
+        IsMultiple(src_one, db["COPY_VW"]*db["COPY_DIMX"]) &&
+        IsMultiple(src_two, db["COPY_WPT"]*db["COPY_DIMY"])) {
+      kernel_name = "CopyMatrixFast";
+    }
+    else {
+      use_fast_kernel = false;
+      kernel_name = (do_pad) ? "CopyPadMatrix" : "CopyMatrix";
+    }
+  }
+
+  // Retrieves the kernel from the compiled binary
+  try {
+    auto kernel = Kernel(program, kernel_name);
+
+    // Uploads alpha as a device buffer (needed for half-precision); in the try to catch errors
+    auto alpha_buffer = Buffer<T>(context, 1);
+    alpha_buffer.Write(queue, 1, &alpha);
+
+    // Sets the kernel arguments
+    if (use_fast_kernel) {
+      kernel.SetArgument(0, static_cast<int>(src_ld));
+      kernel.SetArgument(1, src());
+      kernel.SetArgument(2, dest());
+      kernel.SetArgument(3, alpha_buffer());
+    }
+    else {
+      kernel.SetArgument(0, static_cast<int>(src_one));
+      kernel.SetArgument(1, static_cast<int>(src_two));
+      kernel.SetArgument(2, static_cast<int>(src_ld));
+      kernel.SetArgument(3, static_cast<int>(src_offset));
+      kernel.SetArgument(4, src());
+      kernel.SetArgument(5, static_cast<int>(dest_one));
+      kernel.SetArgument(6, static_cast<int>(dest_two));
+      kernel.SetArgument(7, static_cast<int>(dest_ld));
+      kernel.SetArgument(8, static_cast<int>(dest_offset));
+      kernel.SetArgument(9, dest());
+      kernel.SetArgument(10, alpha_buffer());
+      if (do_pad) {
+        kernel.SetArgument(11, static_cast<int>(do_conjugate));
+      }
+      else {
+        kernel.SetArgument(11, static_cast<int>(upper));
+        kernel.SetArgument(12, static_cast<int>(lower));
+        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
+      }
+    }
+
+    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
+    // parameters in the database.
+    if (do_transpose) {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{
+          dest_one / db["TRA_WPT"],
+          dest_two / db["TRA_WPT"]
+        };
+        const auto local = std::vector<size_t>{db["TRA_DIM"], db["TRA_DIM"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PADTRA_WPT"]), db["PADTRA_TILE"]),
+          Ceil(CeilDiv(dest_two, db["PADTRA_WPT"]), db["PADTRA_TILE"])
+        };
+        const auto local = std::vector<size_t>{db["PADTRA_TILE"], db["PADTRA_TILE"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+    else {
+      if (use_fast_kernel) {
+        const auto global = std::vector<size_t>{
+          dest_one / db["COPY_VW"],
+          dest_two / db["COPY_WPT"]
+        };
+        const auto local = std::vector<size_t>{db["COPY_DIMX"], db["COPY_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+      else {
+        const auto global = std::vector<size_t>{
+          Ceil(CeilDiv(dest_one, db["PAD_WPTX"]), db["PAD_DIMX"]),
+          Ceil(CeilDiv(dest_two, db["PAD_WPTY"]), db["PAD_DIMY"])
+        };
+        const auto local = std::vector<size_t>{db["PAD_DIMX"], db["PAD_DIMY"]};
+        return RunKernel(kernel, queue, device, global, local, event, waitForEvents);
+      }
+    }
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_COMMON_H_
+#endif
diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cpp
index 682e2b63..6b6e7f9e 100644
--- a/src/routines/level1/xamax.cc
+++ b/src/routines/level1/xamax.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xamax.h"
+#include "routines/level1/xamax.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xamax<float>::precision_ = Precision::kSingle;
-template <> const Precision Xamax<double>::precision_ = Precision::kDouble;
-template <> const Precision Xamax<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xamax<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xamax<T>::Xamax(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xdot"}, precision_) {
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level1/xamax.opencl"
   ;
@@ -48,14 +40,14 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
   if (n == 0) { return StatusCode::kInvalidDimension; }
 
   // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorIndex(1, imax_buffer, imax_offset, sizeof(unsigned int));
+  status = TestVectorIndex(1, imax_buffer, imax_offset);
   if (ErrorIn(status)) { return status; }
 
   // Retrieves the Xamax kernels from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel1 = Kernel(program, "Xamax");
     auto kernel2 = Kernel(program, "XamaxEpilogue");
 
@@ -79,7 +71,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
     auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
     auto local1 = std::vector<size_t>{db_["WGS1"]};
     auto kernelEvent = Event();
-    status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
     if (ErrorIn(status)) { return status; }
     eventWaitList.push_back(kernelEvent);
 
@@ -92,7 +84,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
     // Launches the epilogue kernel
     auto global2 = std::vector<size_t>{db_["WGS2"]};
     auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -103,6 +95,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xamax<half>;
 template class Xamax<float>;
 template class Xamax<double>;
 template class Xamax<float2>;
diff --git a/include/internal/routines/level1/xamax.h b/src/routines/level1/xamax.hpp
index c318115e..aa45a8e4 100644
--- a/include/internal/routines/level1/xamax.h
+++ b/src/routines/level1/xamax.hpp
@@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XAMAX_H_
 #define CLBLAST_ROUTINES_XAMAX_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xamax: public Routine<T> {
+class Xamax: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorIndex;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xamax(Queue &queue, EventPointer event, const std::string &name = "AMAX");
 
@@ -43,10 +31,6 @@ class Xamax: public Routine<T> {
   StatusCode DoAmax(const size_t n,
                     const Buffer<unsigned int> &imax_buffer, const size_t imax_offset,
                     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cpp
index ea33d7e1..0c1ce903 100644
--- a/src/routines/level1/xasum.cc
+++ b/src/routines/level1/xasum.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xasum.h"
+#include "routines/level1/xasum.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xasum<float>::precision_ = Precision::kSingle;
-template <> const Precision Xasum<double>::precision_ = Precision::kDouble;
-template <> const Precision Xasum<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xasum<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xasum<T>::Xasum(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xdot"}, precision_) {
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level1/xasum.opencl"
   ;
@@ -48,14 +40,14 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
   if (n == 0) { return StatusCode::kInvalidDimension; }
 
   // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorDot(1, asum_buffer, asum_offset, sizeof(T));
+  status = TestVectorScalar(1, asum_buffer, asum_offset);
   if (ErrorIn(status)) { return status; }
 
   // Retrieves the Xasum kernels from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel1 = Kernel(program, "Xasum");
     auto kernel2 = Kernel(program, "XasumEpilogue");
 
@@ -77,7 +69,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
     auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
     auto local1 = std::vector<size_t>{db_["WGS1"]};
     auto kernelEvent = Event();
-    status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
     if (ErrorIn(status)) { return status; }
     eventWaitList.push_back(kernelEvent);
 
@@ -89,7 +81,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
     // Launches the epilogue kernel
     auto global2 = std::vector<size_t>{db_["WGS2"]};
     auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -100,6 +92,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xasum<half>;
 template class Xasum<float>;
 template class Xasum<double>;
 template class Xasum<float2>;
diff --git a/include/internal/routines/level1/xasum.h b/src/routines/level1/xasum.hpp
index b6e5d2cd..5a253f4d 100644
--- a/include/internal/routines/level1/xasum.h
+++ b/src/routines/level1/xasum.hpp
@@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XASUM_H_
 #define CLBLAST_ROUTINES_XASUM_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xasum: public Routine<T> {
+class Xasum: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorDot;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xasum(Queue &queue, EventPointer event, const std::string &name = "ASUM");
 
@@ -43,10 +31,6 @@ class Xasum: public Routine<T> {
   StatusCode DoAsum(const size_t n,
                     const Buffer<T> &asum_buffer, const size_t asum_offset,
                     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cpp
index 96809a57..5b6c9e77 100644
--- a/src/routines/level1/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xaxpy.h"
+#include "routines/level1/xaxpy.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xaxpy<float>::precision_ = Precision::kSingle;
-template <> const Precision Xaxpy<double>::precision_ = Precision::kDouble;
-template <> const Precision Xaxpy<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xaxpy<T>::Xaxpy(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level1/level1.opencl"
     #include "../../kernels/level1/xaxpy.opencl"
@@ -49,9 +41,9 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
   if (n == 0) { return StatusCode::kInvalidDimension; }
 
   // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
   // Determines whether or not the fast-version can be used
@@ -64,19 +56,23 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
 
   // Retrieves the Xaxpy kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, kernel_name);
 
+    // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+    auto alpha_buffer = Buffer<T>(context_, 1);
+    alpha_buffer.Write(queue_, 1, &alpha);
+
     // Sets the kernel arguments
     if (use_fast_kernel) {
       kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
+      kernel.SetArgument(1, alpha_buffer());
       kernel.SetArgument(2, x_buffer());
       kernel.SetArgument(3, y_buffer());
     }
     else {
       kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha);
+      kernel.SetArgument(1, alpha_buffer());
       kernel.SetArgument(2, x_buffer());
       kernel.SetArgument(3, static_cast<int>(x_offset));
       kernel.SetArgument(4, static_cast<int>(x_inc));
@@ -89,13 +85,13 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
     if (use_fast_kernel) {
       auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
       auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
     }
     else {
       auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
       auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
       auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
     }
     if (ErrorIn(status)) { return status; }
 
@@ -107,6 +103,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xaxpy<half>;
 template class Xaxpy<float>;
 template class Xaxpy<double>;
 template class Xaxpy<float2>;
diff --git a/include/internal/routines/level1/xaxpy.h b/src/routines/level1/xaxpy.hpp
index bc00c8e3..caac871e 100644
--- a/include/internal/routines/level1/xaxpy.h
+++ b/src/routines/level1/xaxpy.hpp
@@ -14,27 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XAXPY_H_
 #define CLBLAST_ROUTINES_XAXPY_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xaxpy: public Routine<T> {
+class Xaxpy: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xaxpy(Queue &queue, EventPointer event, const std::string &name = "AXPY");
 
@@ -42,10 +31,6 @@ class Xaxpy: public Routine<T> {
   StatusCode DoAxpy(const size_t n, const T alpha,
                     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cpp
index d34482ce..673ef349 100644
--- a/src/routines/level1/xcopy.cc
+++ b/src/routines/level1/xcopy.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xcopy.h"
+#include "routines/level1/xcopy.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xcopy<float>::precision_ = Precision::kSingle;
-template <> const Precision Xcopy<double>::precision_ = Precision::kDouble;
-template <> const Precision Xcopy<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xcopy<T>::Xcopy(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level1/level1.opencl"
     #include "../../kernels/level1/xcopy.opencl"
@@ -49,9 +41,9 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
   if (n == 0) { return StatusCode::kInvalidDimension; }
 
   // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
   // Determines whether or not the fast-version can be used
@@ -64,7 +56,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
 
   // Retrieves the Xcopy kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, kernel_name);
 
     // Sets the kernel arguments
@@ -87,13 +79,13 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
     if (use_fast_kernel) {
       auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
       auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
     }
     else {
       auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
       auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
       auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
     }
     if (ErrorIn(status)) { return status; }
 
@@ -105,6 +97,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xcopy<half>;
 template class Xcopy<float>;
 template class Xcopy<double>;
 template class Xcopy<float2>;
diff --git a/include/internal/routines/level1/xcopy.h b/src/routines/level1/xcopy.hpp
index 5786cb0f..0c424ba3 100644
--- a/include/internal/routines/level1/xcopy.h
+++ b/src/routines/level1/xcopy.hpp
@@ -14,27 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XCOPY_H_
 #define CLBLAST_ROUTINES_XCOPY_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xcopy: public Routine<T> {
+class Xcopy: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xcopy(Queue &queue, EventPointer event, const std::string &name = "COPY");
 
@@ -42,10 +31,6 @@ class Xcopy: public Routine<T> {
   StatusCode DoCopy(const size_t n,
                     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cpp
index b2513485..bafea157 100644
--- a/src/routines/level1/xdot.cc
+++ b/src/routines/level1/xdot.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xdot.h"
+#include "routines/level1/xdot.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xdot<float>::precision_ = Precision::kSingle;
-template <> const Precision Xdot<double>::precision_ = Precision::kDouble;
-template <> const Precision Xdot<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xdot<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xdot<T>::Xdot(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xdot"}, precision_) {
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level1/xdot.opencl"
   ;
@@ -50,16 +42,16 @@ StatusCode Xdot<T>::DoDot(const size_t n,
   if (n == 0) { return StatusCode::kInvalidDimension; }
 
   // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorDot(1, dot_buffer, dot_offset, sizeof(T));
+  status = TestVectorScalar(1, dot_buffer, dot_offset);
   if (ErrorIn(status)) { return status; }
 
   // Retrieves the Xdot kernels from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel1 = Kernel(program, "Xdot");
     auto kernel2 = Kernel(program, "XdotEpilogue");
 
@@ -85,7 +77,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
     auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
     auto local1 = std::vector<size_t>{db_["WGS1"]};
     auto kernelEvent = Event();
-    status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
     if (ErrorIn(status)) { return status; }
     eventWaitList.push_back(kernelEvent);
 
@@ -97,7 +89,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
     // Launches the epilogue kernel
     auto global2 = std::vector<size_t>{db_["WGS2"]};
     auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -108,6 +100,7 @@ StatusCode Xdot<T>::DoDot(const size_t n,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xdot<half>;
 template class Xdot<float>;
 template class Xdot<double>;
 template class Xdot<float2>;
diff --git a/include/internal/routines/level1/xdot.h b/src/routines/level1/xdot.hpp
index 95a7ad07..02c1efaa 100644
--- a/include/internal/routines/level1/xdot.h
+++ b/src/routines/level1/xdot.hpp
@@ -14,29 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XDOT_H_
 #define CLBLAST_ROUTINES_XDOT_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xdot: public Routine<T> {
+class Xdot: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::TestVectorDot;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xdot(Queue &queue, EventPointer event, const std::string &name = "DOT");
 
@@ -46,10 +33,6 @@ class Xdot: public Routine<T> {
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const bool do_conjugate = false);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level1/xdotc.cc b/src/routines/level1/xdotc.cpp
index b3a01079..27cf2bab 100644
--- a/src/routines/level1/xdotc.cc
+++ b/src/routines/level1/xdotc.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xdotc.h"
+#include "routines/level1/xdotc.hpp"
 
 #include <string>
 #include <vector>
diff --git a/include/internal/routines/level1/xdotc.h b/src/routines/level1/xdotc.hpp
index 0dc2cfe9..b8cbdaf5 100644
--- a/include/internal/routines/level1/xdotc.h
+++ b/src/routines/level1/xdotc.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XDOTC_H_
 #define CLBLAST_ROUTINES_XDOTC_H_
 
-#include "internal/routines/level1/xdot.h"
+#include "routines/level1/xdot.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level1/xdotu.cc b/src/routines/level1/xdotu.cpp
index 8dded6e0..0bce70b7 100644
--- a/src/routines/level1/xdotu.cc
+++ b/src/routines/level1/xdotu.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xdotu.h"
+#include "routines/level1/xdotu.hpp"
 
 #include <string>
 
diff --git a/include/internal/routines/level1/xdotu.h b/src/routines/level1/xdotu.hpp
index 98988744..b3f73086 100644
--- a/include/internal/routines/level1/xdotu.h
+++ b/src/routines/level1/xdotu.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XDOTU_H_
 #define CLBLAST_ROUTINES_XDOTU_H_
 
-#include "internal/routines/level1/xdot.h"
+#include "routines/level1/xdot.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/include/internal/routines/level1/xmax.h b/src/routines/level1/xmax.hpp
index a872cede..5a0236f2 100644
--- a/include/internal/routines/level1/xmax.h
+++ b/src/routines/level1/xmax.hpp
@@ -14,8 +14,8 @@
 #ifndef CLBLAST_ROUTINES_XMAX_H_
 #define CLBLAST_ROUTINES_XMAX_H_
 
-#include "internal/routine.h"
-#include "internal/routines/level1/xamax.h"
+#include "routine.hpp"
+#include "routines/level1/xamax.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/include/internal/routines/level1/xmin.h b/src/routines/level1/xmin.hpp
index 700c81cc..6befec64 100644
--- a/include/internal/routines/level1/xmin.h
+++ b/src/routines/level1/xmin.hpp
@@ -14,8 +14,8 @@
 #ifndef CLBLAST_ROUTINES_XMIN_H_
 #define CLBLAST_ROUTINES_XMIN_H_
 
-#include "internal/routine.h"
-#include "internal/routines/level1/xamax.h"
+#include "routine.hpp"
+#include "routines/level1/xamax.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cpp
index 86166a0c..97615d8b 100644
--- a/src/routines/level1/xnrm2.cc
+++ b/src/routines/level1/xnrm2.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xnrm2.h"
+#include "routines/level1/xnrm2.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xnrm2<float>::precision_ = Precision::kSingle;
-template <> const Precision Xnrm2<double>::precision_ = Precision::kDouble;
-template <> const Precision Xnrm2<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xnrm2<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xnrm2<T>::Xnrm2(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xdot"}, precision_) {
+    Routine(queue, event, name, {"Xdot"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level1/xnrm2.opencl"
   ;
@@ -48,14 +40,14 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
   if (n == 0) { return StatusCode::kInvalidDimension; }
 
   // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorDot(1, nrm2_buffer, nrm2_offset, sizeof(T));
+  status = TestVectorScalar(1, nrm2_buffer, nrm2_offset);
   if (ErrorIn(status)) { return status; }
 
   // Retrieves the Xnrm2 kernels from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel1 = Kernel(program, "Xnrm2");
     auto kernel2 = Kernel(program, "Xnrm2Epilogue");
 
@@ -77,7 +69,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
     auto global1 = std::vector<size_t>{db_["WGS1"]*temp_size};
     auto local1 = std::vector<size_t>{db_["WGS1"]};
     auto kernelEvent = Event();
-    status = RunKernel(kernel1, global1, local1, kernelEvent.pointer());
+    status = RunKernel(kernel1, queue_, device_, global1, local1, kernelEvent.pointer());
     if (ErrorIn(status)) { return status; }
     eventWaitList.push_back(kernelEvent);
 
@@ -89,7 +81,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
     // Launches the epilogue kernel
     auto global2 = std::vector<size_t>{db_["WGS2"]};
     auto local2 = std::vector<size_t>{db_["WGS2"]};
-    status = RunKernel(kernel2, global2, local2, event_, eventWaitList);
+    status = RunKernel(kernel2, queue_, device_, global2, local2, event_, eventWaitList);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -100,6 +92,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xnrm2<half>;
 template class Xnrm2<float>;
 template class Xnrm2<double>;
 template class Xnrm2<float2>;
diff --git a/include/internal/routines/level1/xnrm2.h b/src/routines/level1/xnrm2.hpp
index 6f6ca74f..7baf07f5 100644
--- a/include/internal/routines/level1/xnrm2.h
+++ b/src/routines/level1/xnrm2.hpp
@@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XNRM2_H_
 #define CLBLAST_ROUTINES_XNRM2_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xnrm2: public Routine<T> {
+class Xnrm2: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorDot;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xnrm2(Queue &queue, EventPointer event, const std::string &name = "NRM2");
 
@@ -43,10 +31,6 @@ class Xnrm2: public Routine<T> {
   StatusCode DoNrm2(const size_t n,
                     const Buffer<T> &nrm2_buffer, const size_t nrm2_offset,
                     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cpp
index b92e2cdf..bcc43c3b 100644
--- a/src/routines/level1/xscal.cc
+++ b/src/routines/level1/xscal.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xscal.h"
+#include "routines/level1/xscal.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xscal<float>::precision_ = Precision::kSingle;
-template <> const Precision Xscal<double>::precision_ = Precision::kDouble;
-template <> const Precision Xscal<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xscal<T>::Xscal(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level1/level1.opencl"
     #include "../../kernels/level1/xscal.opencl"
@@ -48,7 +40,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
   if (n == 0) { return StatusCode::kInvalidDimension; }
 
   // Tests the vector for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
 
   // Determines whether or not the fast-version can be used
@@ -60,7 +52,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
 
   // Retrieves the Xscal kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, kernel_name);
 
     // Sets the kernel arguments
@@ -81,13 +73,13 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
     if (use_fast_kernel) {
       auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
       auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
     }
     else {
       auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
       auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
       auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
     }
     if (ErrorIn(status)) { return status; }
 
@@ -99,6 +91,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xscal<half>;
 template class Xscal<float>;
 template class Xscal<double>;
 template class Xscal<float2>;
diff --git a/include/internal/routines/level1/xscal.h b/src/routines/level1/xscal.hpp
index e10a201d..6c585cb2 100644
--- a/include/internal/routines/level1/xscal.h
+++ b/src/routines/level1/xscal.hpp
@@ -14,36 +14,22 @@
 #ifndef CLBLAST_ROUTINES_XSCAL_H_
 #define CLBLAST_ROUTINES_XSCAL_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xscal: public Routine<T> {
+class Xscal: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xscal(Queue &queue, EventPointer event, const std::string &name = "SCAL");
 
   // Templated-precision implementation of the routine
   StatusCode DoScal(const size_t n, const T alpha,
                     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/include/internal/routines/level1/xsum.h b/src/routines/level1/xsum.hpp
index 2f633b52..84e20bea 100644
--- a/include/internal/routines/level1/xsum.h
+++ b/src/routines/level1/xsum.hpp
@@ -14,8 +14,8 @@
 #ifndef CLBLAST_ROUTINES_XSUM_H_
 #define CLBLAST_ROUTINES_XSUM_H_
 
-#include "internal/routine.h"
-#include "internal/routines/level1/xasum.h"
+#include "routine.hpp"
+#include "routines/level1/xasum.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cpp
index bfc4a739..03907cbd 100644
--- a/src/routines/level1/xswap.cc
+++ b/src/routines/level1/xswap.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level1/xswap.h"
+#include "routines/level1/xswap.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xswap<float>::precision_ = Precision::kSingle;
-template <> const Precision Xswap<double>::precision_ = Precision::kDouble;
-template <> const Precision Xswap<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xswap<T>::Xswap(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xaxpy"}, precision_) {
+    Routine(queue, event, name, {"Xaxpy"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level1/level1.opencl"
     #include "../../kernels/level1/xswap.opencl"
@@ -49,9 +41,9 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
   if (n == 0) { return StatusCode::kInvalidDimension; }
 
   // Tests the vectors for validity
-  auto status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  auto status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
   // Determines whether or not the fast-version can be used
@@ -64,7 +56,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
 
   // Retrieves the Xswap kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, kernel_name);
 
     // Sets the kernel arguments
@@ -87,13 +79,13 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
     if (use_fast_kernel) {
       auto global = std::vector<size_t>{CeilDiv(n, db_["WPT"]*db_["VW"])};
       auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
     }
     else {
       auto n_ceiled = Ceil(n, db_["WGS"]*db_["WPT"]);
       auto global = std::vector<size_t>{n_ceiled/db_["WPT"]};
       auto local = std::vector<size_t>{db_["WGS"]};
-      status = RunKernel(kernel, global, local, event_);
+      status = RunKernel(kernel, queue_, device_, global, local, event_);
     }
     if (ErrorIn(status)) { return status; }
 
@@ -105,6 +97,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xswap<half>;
 template class Xswap<float>;
 template class Xswap<double>;
 template class Xswap<float2>;
diff --git a/include/internal/routines/level1/xswap.h b/src/routines/level1/xswap.hpp
index 0f240763..4f9ea36d 100644
--- a/include/internal/routines/level1/xswap.h
+++ b/src/routines/level1/xswap.hpp
@@ -14,27 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XSWAP_H_
 #define CLBLAST_ROUTINES_XSWAP_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xswap: public Routine<T> {
+class Xswap: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xswap(Queue &queue, EventPointer event, const std::string &name = "SWAP");
 
@@ -42,10 +31,6 @@ class Xswap: public Routine<T> {
   StatusCode DoSwap(const size_t n,
                     const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level2/xgbmv.cc b/src/routines/level2/xgbmv.cpp
index f90e26b2..ea4f001c 100644
--- a/src/routines/level2/xgbmv.cc
+++ b/src/routines/level2/xgbmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgbmv.h"
+#include "routines/level2/xgbmv.hpp"
 
 #include <string>
 #include <vector>
@@ -58,6 +58,7 @@ StatusCode Xgbmv<T>::DoGbmv(const Layout layout, const Transpose a_transpose,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xgbmv<half>;
 template class Xgbmv<float>;
 template class Xgbmv<double>;
 template class Xgbmv<float2>;
diff --git a/include/internal/routines/level2/xgbmv.h b/src/routines/level2/xgbmv.hpp
index bc94c77d..686ab642 100644
--- a/include/internal/routines/level2/xgbmv.h
+++ b/src/routines/level2/xgbmv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XGBMV_H_
 #define CLBLAST_ROUTINES_XGBMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cpp
index f8985038..21fb397c 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemv<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemv<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemv<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Pad", "Xgemv"}, precision_) {
+    Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/xgemv.opencl"
     #include "../../kernels/level2/xgemv_fast.opencl"
@@ -100,12 +92,12 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
 
   // Tests the matrix and the vectors for validity
   auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld); }
   if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n_real, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n_real, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(m_real, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(m_real, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
   // Determines whether or not the fast-version can be used
@@ -134,16 +126,22 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
     local_size = db_["WGS3"];
   }
 
+  // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  auto beta_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &alpha);
+  beta_buffer.Write(queue_, 1, &beta);
+
   // Retrieves the Xgemv kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, kernel_name);
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(m_real));
     kernel.SetArgument(1, static_cast<int>(n_real));
-    kernel.SetArgument(2, alpha);
-    kernel.SetArgument(3, beta);
+    kernel.SetArgument(2, alpha_buffer());
+    kernel.SetArgument(3, beta_buffer());
     kernel.SetArgument(4, static_cast<int>(a_rotated));
     kernel.SetArgument(5, a_buffer());
     kernel.SetArgument(6, static_cast<int>(a_offset));
@@ -162,7 +160,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
     // Launches the kernel
     auto global = std::vector<size_t>{global_size};
     auto local = std::vector<size_t>{local_size};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -173,6 +171,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xgemv<half>;
 template class Xgemv<float>;
 template class Xgemv<double>;
 template class Xgemv<float2>;
diff --git a/include/internal/routines/level2/xgemv.h b/src/routines/level2/xgemv.hpp
index 0b2a8e66..e9afec8d 100644
--- a/include/internal/routines/level2/xgemv.h
+++ b/src/routines/level2/xgemv.hpp
@@ -14,29 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XGEMV_H_
 #define CLBLAST_ROUTINES_XGEMV_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xgemv: public Routine<T> {
+class Xgemv: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixAP;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xgemv(Queue &queue, EventPointer event, const std::string &name = "GEMV");
 
@@ -60,10 +47,6 @@ class Xgemv: public Routine<T> {
                     bool fast_kernel, bool fast_kernel_rot,
                     const size_t parameter, const bool packed,
                     const size_t kl, const size_t ku);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cpp
index 686c7e60..353047d2 100644
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"
 
 #include <string>
 #include <vector>
@@ -19,18 +19,10 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xger<float>::precision_ = Precision::kSingle;
-template <> const Precision Xger<double>::precision_ = Precision::kDouble;
-template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xger<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xger<T>::Xger(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/level2.opencl"
     #include "../../kernels/level2/xger.opencl"
@@ -57,22 +49,26 @@ StatusCode Xger<T>::DoGer(const Layout layout,
   const auto a_two = (a_is_rowmajor) ? m : n;
 
   // Tests the matrix and the vectors for validity
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorX(m, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(m, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
-  // Retrieves the Xgemv kernel from the compiled binary
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &alpha);
+
+  // Retrieves the kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, "Xger");
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(a_one));
     kernel.SetArgument(1, static_cast<int>(a_two));
-    kernel.SetArgument(2, alpha);
+    kernel.SetArgument(2, alpha_buffer());
     kernel.SetArgument(3, x_buffer());
     kernel.SetArgument(4, static_cast<int>(x_offset));
     kernel.SetArgument(5, static_cast<int>(x_inc));
@@ -89,7 +85,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
     auto a_two_ceiled = Ceil(CeilDiv(a_two, db_["WPT"]), db_["WGS2"]);
     auto global = std::vector<size_t>{a_one_ceiled, a_two_ceiled};
     auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -100,6 +96,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xger<half>;
 template class Xger<float>;
 template class Xger<double>;
 template class Xger<float2>;
diff --git a/include/internal/routines/level2/xger.h b/src/routines/level2/xger.hpp
index 5ace9da6..3c6abe44 100644
--- a/include/internal/routines/level2/xger.h
+++ b/src/routines/level2/xger.hpp
@@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XGER_H_
 #define CLBLAST_ROUTINES_XGER_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xger: public Routine<T> {
+class Xger: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xger(Queue &queue, EventPointer event, const std::string &name = "GER");
 
@@ -46,10 +34,6 @@ class Xger: public Routine<T> {
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level2/xgerc.cc b/src/routines/level2/xgerc.cpp
index 73284b52..d9feda97 100644
--- a/src/routines/level2/xgerc.cc
+++ b/src/routines/level2/xgerc.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgerc.h"
+#include "routines/level2/xgerc.hpp"
 
 #include <string>
 
diff --git a/include/internal/routines/level2/xgerc.h b/src/routines/level2/xgerc.hpp
index 6d06ef94..f1d04dfd 100644
--- a/include/internal/routines/level2/xgerc.h
+++ b/src/routines/level2/xgerc.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XGERC_H_
 #define CLBLAST_ROUTINES_XGERC_H_
 
-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xgeru.cc b/src/routines/level2/xgeru.cpp
index 7730d6a5..da9e91c2 100644
--- a/src/routines/level2/xgeru.cc
+++ b/src/routines/level2/xgeru.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xgeru.h"
+#include "routines/level2/xgeru.hpp"
 
 #include <string>
 
diff --git a/include/internal/routines/level2/xgeru.h b/src/routines/level2/xgeru.hpp
index 45ce1cba..fb50e917 100644
--- a/include/internal/routines/level2/xgeru.h
+++ b/src/routines/level2/xgeru.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XGERU_H_
 #define CLBLAST_ROUTINES_XGERU_H_
 
-#include "internal/routines/level2/xger.h"
+#include "routines/level2/xger.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xhbmv.cc b/src/routines/level2/xhbmv.cpp
index 58591b50..f6c0e3c4 100644
--- a/src/routines/level2/xhbmv.cc
+++ b/src/routines/level2/xhbmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhbmv.h"
+#include "routines/level2/xhbmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/include/internal/routines/level2/xhbmv.h b/src/routines/level2/xhbmv.hpp
index f0a6212c..d668eb88 100644
--- a/include/internal/routines/level2/xhbmv.h
+++ b/src/routines/level2/xhbmv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XHBMV_H_
 #define CLBLAST_ROUTINES_XHBMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xhemv.cc b/src/routines/level2/xhemv.cpp
index b4ef0fa4..2cbcf7b4 100644
--- a/src/routines/level2/xhemv.cc
+++ b/src/routines/level2/xhemv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhemv.h"
+#include "routines/level2/xhemv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/include/internal/routines/level2/xhemv.h b/src/routines/level2/xhemv.hpp
index 3daf2457..8e062fd3 100644
--- a/include/internal/routines/level2/xhemv.h
+++ b/src/routines/level2/xhemv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XHEMV_H_
 #define CLBLAST_ROUTINES_XHEMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cpp
index a7116213..ed8ba9e9 100644
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cpp
@@ -11,25 +11,17 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"
 
 #include <string>
 
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
-template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
-template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher<double2, double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T, typename U>
 Xher<T,U>::Xher(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/level2.opencl"
     #include "../../kernels/level2/xher.opencl"
@@ -43,6 +35,7 @@ template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return floa
 template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
 template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
 template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
+template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
 
 // =================================================================================================
 
@@ -63,28 +56,32 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
                          (triangle == Triangle::kLower && layout == Layout::kRowMajor));
   const auto is_rowmajor = (layout == Layout::kRowMajor);
 
-  // Creates a matching version of alpha
-  const auto matching_alpha = GetAlpha(alpha);
-
   // Tests the matrix and the vectors for validity
   auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
   if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
 
   // If alpha is zero an update is not required
   if (alpha == U{0}) { return StatusCode::kSuccess; }
 
-  // Retrieves the Xgemv kernel from the compiled binary
+  // Creates a matching version of alpha
+  const auto matching_alpha = GetAlpha(alpha);
+
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &matching_alpha);
+
+  // Retrieves the kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, "Xher");
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, matching_alpha);
+    kernel.SetArgument(1, alpha_buffer());
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
@@ -99,7 +96,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
     auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
     auto global = std::vector<size_t>{global_one, global_two};
     auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -110,6 +107,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xher<half, half>;
 template class Xher<float, float>;
 template class Xher<double, double>;
 template class Xher<float2, float>;
diff --git a/include/internal/routines/level2/xher.h b/src/routines/level2/xher.hpp
index 861ba302..9ff6bf3f 100644
--- a/include/internal/routines/level2/xher.h
+++ b/src/routines/level2/xher.hpp
@@ -14,28 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XHER_H_
 #define CLBLAST_ROUTINES_XHER_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T, typename U>
-class Xher: public Routine<T> {
+class Xher: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixAP;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xher(Queue &queue, EventPointer event, const std::string &name = "HER");
 
@@ -49,10 +37,6 @@ class Xher: public Routine<T> {
                    const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                    const bool packed = false);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cpp
index 3fd1a961..50572cea 100644
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cpp
@@ -11,25 +11,17 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"
 
 #include <string>
 
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
-template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
-template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher2<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xher2<T>::Xher2(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Xger"}, precision_) {
+    Routine(queue, event, name, {"Xger"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/level2.opencl"
     #include "../../kernels/level2/xher2.opencl"
@@ -58,22 +50,26 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
 
   // Tests the matrix and the vectors for validity
   auto status = StatusCode::kSuccess;
-  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
-  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld, sizeof(T)); }
+  if (packed) { status = TestMatrixAP(n, a_buffer, a_offset); }
+  else { status = TestMatrixA(n, n, a_buffer, a_offset, a_ld); }
   if (ErrorIn(status)) { return status; }
-  status = TestVectorX(n, x_buffer, x_offset, x_inc, sizeof(T));
+  status = TestVectorX(n, x_buffer, x_offset, x_inc);
   if (ErrorIn(status)) { return status; }
-  status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
+  status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
-  // Retrieves the Xgemv kernel from the compiled binary
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &alpha);
+
+  // Retrieves the kernel from the compiled binary
   try {
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, "Xher2");
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, alpha);
+    kernel.SetArgument(1, alpha_buffer());
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
@@ -91,7 +87,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
     auto global_two = Ceil(CeilDiv(n, db_["WPT"]), db_["WGS2"]);
     auto global = std::vector<size_t>{global_one, global_two};
     auto local = std::vector<size_t>{db_["WGS1"], db_["WGS2"]};
-    status = RunKernel(kernel, global, local, event_);
+    status = RunKernel(kernel, queue_, device_, global, local, event_);
     if (ErrorIn(status)) { return status; }
 
     // Succesfully finished the computation
@@ -102,6 +98,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xher2<half>;
 template class Xher2<float>;
 template class Xher2<double>;
 template class Xher2<float2>;
diff --git a/include/internal/routines/level2/xher2.h b/src/routines/level2/xher2.hpp
index 9a23199e..8c53c047 100644
--- a/include/internal/routines/level2/xher2.h
+++ b/src/routines/level2/xher2.hpp
@@ -14,29 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XHER2_H_
 #define CLBLAST_ROUTINES_XHER2_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xher2: public Routine<T> {
+class Xher2: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestVectorX;
-  using Routine<T>::TestVectorY;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixAP;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xher2(Queue &queue, EventPointer event, const std::string &name = "HER2");
 
@@ -48,10 +35,6 @@ class Xher2: public Routine<T> {
                     const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc,
                     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                     const bool packed = false);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level2/xhpmv.cc b/src/routines/level2/xhpmv.cpp
index 92686dbe..e6f82b34 100644
--- a/src/routines/level2/xhpmv.cc
+++ b/src/routines/level2/xhpmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpmv.h"
+#include "routines/level2/xhpmv.hpp"
 
 #include <string>
 #include <vector>
diff --git a/include/internal/routines/level2/xhpmv.h b/src/routines/level2/xhpmv.hpp
index a1d5595a..b11192f9 100644
--- a/include/internal/routines/level2/xhpmv.h
+++ b/src/routines/level2/xhpmv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XHPMV_H_
 #define CLBLAST_ROUTINES_XHPMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xhpr.cc b/src/routines/level2/xhpr.cpp
index 4b31ad09..225ebfe5 100644
--- a/src/routines/level2/xhpr.cc
+++ b/src/routines/level2/xhpr.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpr.h"
+#include "routines/level2/xhpr.hpp"
 
 #include <string>
 
diff --git a/include/internal/routines/level2/xhpr.h b/src/routines/level2/xhpr.hpp
index 6554d74c..37801c68 100644
--- a/include/internal/routines/level2/xhpr.h
+++ b/src/routines/level2/xhpr.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XHPR_H_
 #define CLBLAST_ROUTINES_XHPR_H_
 
-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xhpr2.cc b/src/routines/level2/xhpr2.cpp
index 9be24f43..85f9d3f9 100644
--- a/src/routines/level2/xhpr2.cc
+++ b/src/routines/level2/xhpr2.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xhpr2.h"
+#include "routines/level2/xhpr2.hpp"
 
 #include <string>
 
diff --git a/include/internal/routines/level2/xhpr2.h b/src/routines/level2/xhpr2.hpp
index d95e7b61..d66dce55 100644
--- a/include/internal/routines/level2/xhpr2.h
+++ b/src/routines/level2/xhpr2.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XHPR2_H_
 #define CLBLAST_ROUTINES_XHPR2_H_
 
-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xsbmv.cc b/src/routines/level2/xsbmv.cpp
index bc82c88d..28730899 100644
--- a/src/routines/level2/xsbmv.cc
+++ b/src/routines/level2/xsbmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsbmv.h"
+#include "routines/level2/xsbmv.hpp"
 
 #include <string>
 #include <vector>
@@ -57,6 +57,7 @@ StatusCode Xsbmv<T>::DoSbmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsbmv<half>;
 template class Xsbmv<float>;
 template class Xsbmv<double>;
 
diff --git a/include/internal/routines/level2/xsbmv.h b/src/routines/level2/xsbmv.hpp
index 4328e377..16c5e9a8 100644
--- a/include/internal/routines/level2/xsbmv.h
+++ b/src/routines/level2/xsbmv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XSBMV_H_
 #define CLBLAST_ROUTINES_XSBMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xspmv.cc b/src/routines/level2/xspmv.cpp
index 6e00dcfa..f6651012 100644
--- a/src/routines/level2/xspmv.cc
+++ b/src/routines/level2/xspmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspmv.h"
+#include "routines/level2/xspmv.hpp"
 
 #include <string>
 #include <vector>
@@ -57,6 +57,7 @@ StatusCode Xspmv<T>::DoSpmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xspmv<half>;
 template class Xspmv<float>;
 template class Xspmv<double>;
 
diff --git a/include/internal/routines/level2/xspmv.h b/src/routines/level2/xspmv.hpp
index ca3e28b6..a0c69b85 100644
--- a/include/internal/routines/level2/xspmv.h
+++ b/src/routines/level2/xspmv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XSPMV_H_
 #define CLBLAST_ROUTINES_XSPMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cpp
index 55af2f29..a75fe9c3 100644
--- a/src/routines/level2/xspr.cc
+++ b/src/routines/level2/xspr.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspr.h"
+#include "routines/level2/xspr.hpp"
 
 #include <string>
 
@@ -44,6 +44,7 @@ StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xspr<half>;
 template class Xspr<float>;
 template class Xspr<double>;
 
diff --git a/include/internal/routines/level2/xspr.h b/src/routines/level2/xspr.hpp
index 7e91abc5..6468c736 100644
--- a/include/internal/routines/level2/xspr.h
+++ b/src/routines/level2/xspr.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XSPR_H_
 #define CLBLAST_ROUTINES_XSPR_H_
 
-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cpp
index 9a3f97ce..c39a2eb4 100644
--- a/src/routines/level2/xspr2.cc
+++ b/src/routines/level2/xspr2.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xspr2.h"
+#include "routines/level2/xspr2.hpp"
 
 #include <string>
 
@@ -46,6 +46,7 @@ StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xspr2<half>;
 template class Xspr2<float>;
 template class Xspr2<double>;
 
diff --git a/include/internal/routines/level2/xspr2.h b/src/routines/level2/xspr2.hpp
index a34be8e8..693c56a1 100644
--- a/include/internal/routines/level2/xspr2.h
+++ b/src/routines/level2/xspr2.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XSPR2_H_
 #define CLBLAST_ROUTINES_XSPR2_H_
 
-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xsymv.cc b/src/routines/level2/xsymv.cpp
index a9eb284f..648d2a3e 100644
--- a/src/routines/level2/xsymv.cc
+++ b/src/routines/level2/xsymv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsymv.h"
+#include "routines/level2/xsymv.hpp"
 
 #include <string>
 #include <vector>
@@ -57,6 +57,7 @@ StatusCode Xsymv<T>::DoSymv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsymv<half>;
 template class Xsymv<float>;
 template class Xsymv<double>;
 
diff --git a/include/internal/routines/level2/xsymv.h b/src/routines/level2/xsymv.hpp
index 98a0ce88..67815f2f 100644
--- a/include/internal/routines/level2/xsymv.h
+++ b/src/routines/level2/xsymv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XSYMV_H_
 #define CLBLAST_ROUTINES_XSYMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cpp
index 4b3928e5..758d8f8f 100644
--- a/src/routines/level2/xsyr.cc
+++ b/src/routines/level2/xsyr.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsyr.h"
+#include "routines/level2/xsyr.hpp"
 
 #include <string>
 
@@ -43,6 +43,7 @@ StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsyr<half>;
 template class Xsyr<float>;
 template class Xsyr<double>;
 
diff --git a/include/internal/routines/level2/xsyr.h b/src/routines/level2/xsyr.hpp
index f88498ae..20393454 100644
--- a/include/internal/routines/level2/xsyr.h
+++ b/src/routines/level2/xsyr.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XSYR_H_
 #define CLBLAST_ROUTINES_XSYR_H_
 
-#include "internal/routines/level2/xher.h"
+#include "routines/level2/xher.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cpp
index 3ae389e0..6f43b219 100644
--- a/src/routines/level2/xsyr2.cc
+++ b/src/routines/level2/xsyr2.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xsyr2.h"
+#include "routines/level2/xsyr2.hpp"
 
 #include <string>
 
@@ -45,6 +45,7 @@ StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsyr2<half>;
 template class Xsyr2<float>;
 template class Xsyr2<double>;
 
diff --git a/include/internal/routines/level2/xsyr2.h b/src/routines/level2/xsyr2.hpp
index d2d3143a..1a8dcbe8 100644
--- a/include/internal/routines/level2/xsyr2.h
+++ b/src/routines/level2/xsyr2.hpp
@@ -14,7 +14,7 @@
 #ifndef CLBLAST_ROUTINES_XSYR2_H_
 #define CLBLAST_ROUTINES_XSYR2_H_
 
-#include "internal/routines/level2/xher2.h"
+#include "routines/level2/xher2.hpp"
 
 namespace clblast {
 // =================================================================================================
diff --git a/src/routines/level2/xtbmv.cc b/src/routines/level2/xtbmv.cpp
index 47371c87..e315c544 100644
--- a/src/routines/level2/xtbmv.cc
+++ b/src/routines/level2/xtbmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtbmv.h"
+#include "routines/level2/xtbmv.hpp"
 
 #include <string>
 #include <vector>
@@ -72,6 +72,7 @@ StatusCode Xtbmv<T>::DoTbmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xtbmv<half>;
 template class Xtbmv<float>;
 template class Xtbmv<double>;
 template class Xtbmv<float2>;
diff --git a/include/internal/routines/level2/xtbmv.h b/src/routines/level2/xtbmv.hpp
index 3b358080..389e9705 100644
--- a/include/internal/routines/level2/xtbmv.h
+++ b/src/routines/level2/xtbmv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XTBMV_H_
 #define CLBLAST_ROUTINES_XTBMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -25,12 +25,10 @@ namespace clblast {
 template <typename T>
 class Xtbmv: public Xgemv<T> {
  public:
-  
-  // Members from the base class
-  using Routine<T>::queue_;
-  using Routine<T>::context_;
 
   // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
   using Xgemv<T>::MatVec;
 
   // Constructor
diff --git a/src/routines/level2/xtpmv.cc b/src/routines/level2/xtpmv.cpp
index c63cb9b2..46811089 100644
--- a/src/routines/level2/xtpmv.cc
+++ b/src/routines/level2/xtpmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtpmv.h"
+#include "routines/level2/xtpmv.hpp"
 
 #include <string>
 #include <vector>
@@ -72,6 +72,7 @@ StatusCode Xtpmv<T>::DoTpmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xtpmv<half>;
 template class Xtpmv<float>;
 template class Xtpmv<double>;
 template class Xtpmv<float2>;
diff --git a/include/internal/routines/level2/xtpmv.h b/src/routines/level2/xtpmv.hpp
index f306cf4a..0e8cf1d2 100644
--- a/include/internal/routines/level2/xtpmv.h
+++ b/src/routines/level2/xtpmv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XTPMV_H_
 #define CLBLAST_ROUTINES_XTPMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -25,12 +25,10 @@ namespace clblast {
 template <typename T>
 class Xtpmv: public Xgemv<T> {
  public:
-  
-  // Members from the base class
-  using Routine<T>::queue_;
-  using Routine<T>::context_;
 
   // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
   using Xgemv<T>::MatVec;
 
   // Constructor
diff --git a/src/routines/level2/xtrmv.cc b/src/routines/level2/xtrmv.cpp
index 9111d41d..d2f24252 100644
--- a/src/routines/level2/xtrmv.cc
+++ b/src/routines/level2/xtrmv.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level2/xtrmv.h"
+#include "routines/level2/xtrmv.hpp"
 
 #include <string>
 #include <vector>
@@ -72,6 +72,7 @@ StatusCode Xtrmv<T>::DoTrmv(const Layout layout, const Triangle triangle,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xtrmv<half>;
 template class Xtrmv<float>;
 template class Xtrmv<double>;
 template class Xtrmv<float2>;
diff --git a/include/internal/routines/level2/xtrmv.h b/src/routines/level2/xtrmv.hpp
index cf0824a4..07dd7841 100644
--- a/include/internal/routines/level2/xtrmv.h
+++ b/src/routines/level2/xtrmv.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XTRMV_H_
 #define CLBLAST_ROUTINES_XTRMV_H_
 
-#include "internal/routines/level2/xgemv.h"
+#include "routines/level2/xgemv.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -25,12 +25,10 @@ namespace clblast {
 template <typename T>
 class Xtrmv: public Xgemv<T> {
  public:
-  
-  // Members from the base class
-  using Routine<T>::queue_;
-  using Routine<T>::context_;
 
   // Uses the generic matrix-vector routine
+  using Xgemv<T>::queue_;
+  using Xgemv<T>::context_;
   using Xgemv<T>::MatVec;
 
   // Constructor
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cpp
index 3699b548..9ea5559c 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xgemm.h"
+#include "routines/level3/xgemm.hpp"
 
 #include <string>
 #include <vector>
@@ -19,23 +19,19 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xgemm<float>::precision_ = Precision::kSingle;
-template <> const Precision Xgemm<double>::precision_ = Precision::kDouble;
-template <> const Precision Xgemm<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
+    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
+    #include "../../kernels/level3/convert_symmetric.opencl"
+    #include "../../kernels/level3/convert_triangular.opencl"
+    #include "../../kernels/level3/convert_hermitian.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
@@ -62,27 +58,27 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of
   // col-major) to be transformed, so transposing requirements are not the same as whether or not
   // the matrix is actually transposed in memory.
-  auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
-                   (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
-  auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
-                   (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
-  auto c_rotated = (layout == Layout::kRowMajor);
-  auto a_do_transpose =  a_rotated;
-  auto b_do_transpose = !b_rotated;
-  auto c_do_transpose =  c_rotated;
+  const auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) ||
+                         (layout == Layout::kRowMajor && a_transpose == Transpose::kNo);
+  const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
+                         (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
+  const auto c_rotated = (layout == Layout::kRowMajor);
+  const auto a_do_transpose =  a_rotated;
+  const auto b_do_transpose = !b_rotated;
+  const auto c_do_transpose =  c_rotated;
 
   // In case of complex data-types, the transpose can also become a conjugate transpose
-  auto a_conjugate = (a_transpose == Transpose::kConjugate);
-  auto b_conjugate = (b_transpose == Transpose::kConjugate);
+  const auto a_conjugate = (a_transpose == Transpose::kConjugate);
+  const auto b_conjugate = (b_transpose == Transpose::kConjugate);
 
   // Computes the first and second dimensions of the 3 matrices taking into account whether the
   // matrices are rotated or not
-  auto a_one = (a_rotated) ? k : m;
-  auto a_two = (a_rotated) ? m : k;
-  auto b_one = (b_rotated) ? n : k;
-  auto b_two = (b_rotated) ? k : n;
-  auto c_one = (c_rotated) ? n : m;
-  auto c_two = (c_rotated) ? m : n;
+  const auto a_one = (a_rotated) ? k : m;
+  const auto a_two = (a_rotated) ? m : k;
+  const auto b_one = (b_rotated) ? n : k;
+  const auto b_two = (b_rotated) ? k : n;
+  const auto c_one = (c_rotated) ? n : m;
+  const auto c_two = (c_rotated) ? m : n;
 
   // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and
   // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the
@@ -91,23 +87,23 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   //    matrix A cannot be less than K when rotated, or less than M when not-rotated
   //    matrix B cannot be less than N when rotated, or less than K when not-rotated
   //    matrix C cannot be less than N when rotated, or less than M when not-rotated
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld, sizeof(T));
+  status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
   if (ErrorIn(status)) { return status; }
 
   // Calculates the ceiled versions of m, n, and k
-  auto m_ceiled = Ceil(m, db_["MWG"]);
-  auto n_ceiled = Ceil(n, db_["NWG"]);
-  auto k_ceiled = Ceil(k, db_["KWG"]);
+  const auto m_ceiled = Ceil(m, db_["MWG"]);
+  const auto n_ceiled = Ceil(n, db_["NWG"]);
+  const auto k_ceiled = Ceil(k, db_["KWG"]);
 
   // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
   try {
 
     // Loads the program from the database
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
 
     // Determines whether or not temporary matrices are needed
     auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
@@ -118,9 +114,15 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
                      c_do_transpose == false;
 
     // Creates the temporary matrices
-    auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*m_ceiled);
-    auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
+    const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*m_ceiled);
+    const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+    const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
+
+    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    auto alpha_buffer = Buffer<T>(context_, 1);
+    auto beta_buffer = Buffer<T>(context_, 1);
+    alpha_buffer.Write(queue_, 1, &alpha);
+    beta_buffer.Write(queue_, 1, &beta);
 
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
@@ -131,10 +133,11 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     // case nothing has to be done, these kernels can be skipped.
     if (!a_no_temp) {
       auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
-                                      program, true, a_do_transpose, a_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, a_do_transpose, a_conjugate);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessA);
     }
@@ -142,10 +145,11 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     // As above, but now for matrix B
     if (!b_no_temp) {
       auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
                                       b_one, b_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
-                                      program, true, b_do_transpose, b_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, b_do_transpose, b_conjugate);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessB);
     }
@@ -153,10 +157,11 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     // As above, but now for matrix C. This is only necessary if C is used both as input and output.
     if (!c_no_temp && beta != static_cast<T>(0)) {
       auto eventProcessC = Event();
-      status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
                                       c_one, c_two, c_ld, c_offset, c_buffer,
                                       m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
-                                      program, true, c_do_transpose, false);
+                                      ConstantOne<T>(), program,
+                                      true, c_do_transpose, false);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessC);
     }
@@ -169,32 +174,33 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       kernel.SetArgument(0, static_cast<int>(m_ceiled));
       kernel.SetArgument(1, static_cast<int>(n_ceiled));
       kernel.SetArgument(2, static_cast<int>(k_ceiled));
-      kernel.SetArgument(3, alpha);
-      kernel.SetArgument(4, beta);
+      kernel.SetArgument(3, alpha_buffer());
+      kernel.SetArgument(4, beta_buffer());
       kernel.SetArgument(5, a_temp());
       kernel.SetArgument(6, b_temp());
       kernel.SetArgument(7, c_temp());
 
       // Computes the global and local thread sizes
-      auto global = std::vector<size_t>{
+      const auto global = std::vector<size_t>{
         (m_ceiled * db_["MDIMC"]) / db_["MWG"],
         (n_ceiled * db_["NDIMC"]) / db_["NWG"]
       };
-      auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
+      const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
 
       // Launches the kernel
       auto eventKernel = Event();
       auto eventPointer = (!c_no_temp) ? eventKernel.pointer() : event_;
-      status = RunKernel(kernel, global, local, eventPointer, eventWaitList);
+      status = RunKernel(kernel, queue_, device_, global, local, eventPointer, eventWaitList);
       if (ErrorIn(status)) { return status; }
 
       // Runs the post-processing kernel if needed
       if (!c_no_temp) {
         eventWaitList.push_back(eventKernel);
-        status = PadCopyTransposeMatrix(event_, eventWaitList,
+        status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
                                         m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
                                         c_one, c_two, c_ld, c_offset, c_buffer,
-                                        program, false, c_do_transpose, false);
+                                        ConstantOne<T>(), program,
+                                        false, c_do_transpose, false);
         if (ErrorIn(status)) { return status; }
       }
 
@@ -207,6 +213,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
 // =================================================================================================
 
 // Compiles the templated class
+template class Xgemm<half>;
 template class Xgemm<float>;
 template class Xgemm<double>;
 template class Xgemm<float2>;
diff --git a/include/internal/routines/level3/xgemm.h b/src/routines/level3/xgemm.hpp
index 85fb0616..bc51c7f5 100644
--- a/include/internal/routines/level3/xgemm.h
+++ b/src/routines/level3/xgemm.hpp
@@ -14,30 +14,16 @@
 #ifndef CLBLAST_ROUTINES_XGEMM_H_
 #define CLBLAST_ROUTINES_XGEMM_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xgemm: public Routine<T> {
+class Xgemm: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::PadCopyTransposeMatrix;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixB;
-  using Routine<T>::TestMatrixC;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xgemm(Queue &queue, EventPointer event, const std::string &name = "GEMM");
 
@@ -49,10 +35,6 @@ class Xgemm: public Routine<T> {
                     const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                     const T beta,
                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cpp
index d2fbf36e..9813503e 100644
--- a/src/routines/level3/xhemm.cc
+++ b/src/routines/level3/xhemm.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xhemm.h"
+#include "routines/level3/xhemm.hpp"
 
 #include <string>
 #include <vector>
@@ -45,7 +45,7 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
   auto k = (side == Side::kLeft) ? m : n;
 
   // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
 
   // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
@@ -61,7 +61,7 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
     // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
     // routine afterwards
     try {
-      const auto program = GetProgramFromCache();
+      const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
       auto kernel = Kernel(program, kernel_name);
 
       // Sets the arguments for the hermitian-to-squared kernel
@@ -80,7 +80,7 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
                                         Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
       auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
       auto kernelEvent = Event();
-      status = RunKernel(kernel, global, local, kernelEvent.pointer());
+      status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
       if (ErrorIn(status)) { return status; }
 
       // Synchronize now: 'DoGemm' does not accept a list of events to wait for
diff --git a/include/internal/routines/level3/xhemm.h b/src/routines/level3/xhemm.hpp
index ec42b569..272bd2ec 100644
--- a/include/internal/routines/level3/xhemm.h
+++ b/src/routines/level3/xhemm.hpp
@@ -15,7 +15,7 @@
 #ifndef CLBLAST_ROUTINES_XHEMM_H_
 #define CLBLAST_ROUTINES_XHEMM_H_
 
-#include "internal/routines/level3/xgemm.h"
+#include "routines/level3/xgemm.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -25,15 +25,12 @@ template <typename T>
 class Xhemm: public Xgemm<T> {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
-  // Uses the regular Xgemm routine
+  // Uses methods and variables from the regular Xgemm routine
+  using Xgemm<T>::routine_name_;
+  using Xgemm<T>::queue_;
+  using Xgemm<T>::context_;
+  using Xgemm<T>::device_;
+  using Xgemm<T>::db_;
   using Xgemm<T>::DoGemm;
 
   // Constructor
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cpp
index 2c2c815d..bd7a053e 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xher2k.h"
+#include "routines/level3/xher2k.hpp"
 
 #include <string>
 #include <vector>
@@ -19,21 +19,16 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xher2k<float2,float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xher2k<double2,double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T, typename U>
 Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
+    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
@@ -75,11 +70,11 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
   //    matrix A cannot be less than N when rotated, or less than K when not-rotated
   //    matrix B cannot be less than N when rotated, or less than K when not-rotated
   //    matrix C cannot be less than N
-  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
   if (ErrorIn(status)) { return status; }
 
   // Calculates the ceiled versions of n and k
@@ -93,7 +88,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
   try {
 
     // Loads the program from the database
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
 
     // Determines whether or not temporary matrices are needed
     auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
@@ -112,6 +107,13 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
+    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    auto complex_beta = T{beta, static_cast<U>(0.0)};
+    auto alpha_buffer = Buffer<T>(context_, 1);
+    auto beta_buffer = Buffer<T>(context_, 1);
+    alpha_buffer.Write(queue_, 1, &alpha);
+    beta_buffer.Write(queue_, 1, &complex_beta);
+
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -121,37 +123,41 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     // case nothing has to be done, these kernels can be skipped.
     if (!a1_no_temp) {
       auto eventProcessA1 = Event();
-      status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
-                                      program, true, ab_rotated, ab_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, ab_conjugate);
       eventWaitList.push_back(eventProcessA1);
       if (ErrorIn(status)) { return status; }
     }
     if (!a2_no_temp) {
       auto eventProcessA2 = Event();
-      status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
-                                      program, true, ab_rotated, !ab_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, !ab_conjugate);
       eventWaitList.push_back(eventProcessA2);
       if (ErrorIn(status)) { return status; }
     }
     if (!b1_no_temp) {
       auto eventProcessB1 = Event();
-      status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
-                                      program, true, ab_rotated, ab_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, ab_conjugate);
       eventWaitList.push_back(eventProcessB1);
       if (ErrorIn(status)) { return status; }
     }
     if (!b2_no_temp) {
       auto eventProcessB2 = Event();
-      status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
-                                      program, true, ab_rotated, !ab_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, !ab_conjugate);
       eventWaitList.push_back(eventProcessB2);
       if (ErrorIn(status)) { return status; }
     }
@@ -159,10 +165,11 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
     // modify the other triangle.
     auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+    status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    program, true, c_rotated, false);
+                                    ConstantOne<T>(), program,
+                                    true, c_rotated, false);
     eventWaitList.push_back(eventProcessC);
     if (ErrorIn(status)) { return status; }
 
@@ -171,11 +178,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       auto kernel = Kernel(program, kernel_name);
 
       // Sets the kernel arguments
-      auto complex_beta = T{beta, static_cast<U>(0.0)};
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, complex_beta);
+      kernel.SetArgument(2, alpha_buffer());
+      kernel.SetArgument(3, beta_buffer());
       kernel.SetArgument(4, a1_temp());
       kernel.SetArgument(5, b2_temp());
       kernel.SetArgument(6, c_temp());
@@ -189,31 +195,34 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
 
       // Launches the kernel
       auto eventKernel1 = Event();
-      status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList);
+      status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventKernel1);
 
       // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
       auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
       auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
-      kernel.SetArgument(2, conjugate_alpha);
-      kernel.SetArgument(3, complex_one);
+      alpha_buffer.Write(queue_, 1, &conjugate_alpha);
+      beta_buffer.Write(queue_, 1, &complex_one);
+      kernel.SetArgument(2, alpha_buffer());
+      kernel.SetArgument(3, beta_buffer());
       kernel.SetArgument(4, b1_temp());
       kernel.SetArgument(5, a2_temp());
 
       // Runs the kernel again
       auto eventKernel2 = Event();
-      status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList);
+      status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventKernel2);
 
       // Runs the post-processing kernel
       auto upper = (triangle == Triangle::kUpper);
       auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(event_, eventWaitList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
-                                      program, false, c_rotated, false, upper, lower, true);
+                                      ConstantOne<T>(), program,
+                                      false, c_rotated, false, upper, lower, true);
       if (ErrorIn(status)) { return status; }
 
       // Successfully finished the computation
diff --git a/include/internal/routines/level3/xher2k.h b/src/routines/level3/xher2k.hpp
index 623afd49..23996219 100644
--- a/include/internal/routines/level3/xher2k.h
+++ b/src/routines/level3/xher2k.hpp
@@ -16,30 +16,16 @@
 #ifndef CLBLAST_ROUTINES_XHER2K_H_
 #define CLBLAST_ROUTINES_XHER2K_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T, typename U>
-class Xher2k: public Routine<T> {
+class Xher2k: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::PadCopyTransposeMatrix;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixB;
-  using Routine<T>::TestMatrixC;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xher2k(Queue &queue, EventPointer event, const std::string &name = "HER2K");
 
@@ -51,10 +37,6 @@ class Xher2k: public Routine<T> {
                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                      const U beta,
                      const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cpp
index 414c4760..6ef7f21f 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xherk.h"
+#include "routines/level3/xherk.hpp"
 
 #include <string>
 #include <vector>
@@ -19,21 +19,16 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xherk<float2,float>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xherk<double2,double>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T, typename U>
 Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
+    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
@@ -74,9 +69,9 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
   // space. Also tests that the leading dimensions of:
   //    matrix A cannot be less than N when rotated, or less than K when not-rotated
   //    matrix C cannot be less than N
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
   if (ErrorIn(status)) { return status; }
 
   // Calculates the ceiled versions of n and k
@@ -90,7 +85,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
   try {
 
     // Loads the program from the database
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
 
     // Determines whether or not temporary matrices are needed
     auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
@@ -103,6 +98,14 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
+    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    auto complex_alpha = T{alpha, static_cast<U>(0.0)};
+    auto complex_beta = T{beta, static_cast<U>(0.0)};
+    auto alpha_buffer = Buffer<T>(context_, 1);
+    auto beta_buffer = Buffer<T>(context_, 1);
+    alpha_buffer.Write(queue_, 1, &complex_alpha);
+    beta_buffer.Write(queue_, 1, &complex_beta);
+
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -112,19 +115,21 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     // case nothing has to be done, these kernels can be skipped. Two copies are created.
     if (!a_no_temp) {
       auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      program, true, a_rotated, a_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, a_rotated, a_conjugate);
       eventWaitList.push_back(eventProcessA);
       if (ErrorIn(status)) { return status; }
     }
     if (!b_no_temp) {
       auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
-                                      program, true, a_rotated, b_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, a_rotated, b_conjugate);
       eventWaitList.push_back(eventProcessB);
       if (ErrorIn(status)) { return status; }
     }
@@ -132,10 +137,11 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
     // modify the other triangle.
     auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+    status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    program, true, c_rotated, false);
+                                    ConstantOne<T>(), program,
+                                    true, c_rotated, false);
     eventWaitList.push_back(eventProcessC);
     if (ErrorIn(status)) { return status; }
 
@@ -144,12 +150,10 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
       auto kernel = Kernel(program, kernel_name);
 
       // Sets the kernel arguments
-      auto complex_alpha = T{alpha, static_cast<U>(0.0)};
-      auto complex_beta = T{beta, static_cast<U>(0.0)};
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, complex_alpha);
-      kernel.SetArgument(3, complex_beta);
+      kernel.SetArgument(2, alpha_buffer());
+      kernel.SetArgument(3, beta_buffer());
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, b_temp());
       kernel.SetArgument(6, c_temp());
@@ -163,17 +167,18 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
 
       // Launches the kernel
       auto eventKernel = Event();
-      status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
+      status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventKernel);
 
       // Runs the post-processing kernel
       auto upper = (triangle == Triangle::kUpper);
       auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(event_, eventWaitList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
-                                      program, false, c_rotated, false, upper, lower, true);
+                                      ConstantOne<T>(), program,
+                                      false, c_rotated, false, upper, lower, true);
       if (ErrorIn(status)) { return status; }
 
       // Successfully finished the computation
diff --git a/include/internal/routines/level3/xherk.h b/src/routines/level3/xherk.hpp
index 629695ff..3f156a1b 100644
--- a/include/internal/routines/level3/xherk.h
+++ b/src/routines/level3/xherk.hpp
@@ -16,29 +16,16 @@
 #ifndef CLBLAST_ROUTINES_XHERK_H_
 #define CLBLAST_ROUTINES_XHERK_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T, typename U>
-class Xherk: public Routine<T> {
+class Xherk: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::PadCopyTransposeMatrix;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixC;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xherk(Queue &queue, EventPointer event, const std::string &name = "HERK");
 
@@ -49,10 +36,6 @@ class Xherk: public Routine<T> {
                     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                     const U beta,
                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cpp
index a39026f1..04e4b718 100644
--- a/src/routines/level3/xsymm.cc
+++ b/src/routines/level3/xsymm.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xsymm.h"
+#include "routines/level3/xsymm.hpp"
 
 #include <string>
 #include <vector>
@@ -45,7 +45,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
   auto k = (side == Side::kLeft) ? m : n;
 
   // Checks for validity of the squared A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
 
   // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
@@ -61,7 +61,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
     // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
     // routine afterwards
     try {
-      const auto program = GetProgramFromCache();
+      const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
       auto kernel = Kernel(program, kernel_name);
 
       // Sets the arguments for the symmetric-to-squared kernel
@@ -80,7 +80,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
                                         Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
       auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
       auto kernelEvent = Event();
-      status = RunKernel(kernel, global, local, kernelEvent.pointer());
+      status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
       if (ErrorIn(status)) { return status; }
 
       // Synchronize now: 'DoGemm' does not accept a list of events to wait for
@@ -127,6 +127,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsymm<half>;
 template class Xsymm<float>;
 template class Xsymm<double>;
 template class Xsymm<float2>;
diff --git a/include/internal/routines/level3/xsymm.h b/src/routines/level3/xsymm.hpp
index 16ad6f53..428f78ef 100644
--- a/include/internal/routines/level3/xsymm.h
+++ b/src/routines/level3/xsymm.hpp
@@ -17,7 +17,7 @@
 #ifndef CLBLAST_ROUTINES_XSYMM_H_
 #define CLBLAST_ROUTINES_XSYMM_H_
 
-#include "internal/routines/level3/xgemm.h"
+#include "routines/level3/xgemm.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -27,15 +27,12 @@ template <typename T>
 class Xsymm: public Xgemm<T> {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
-  // Uses the regular Xgemm routine
+  // Uses methods and variables from the regular Xgemm routine
+  using Xgemm<T>::routine_name_;
+  using Xgemm<T>::queue_;
+  using Xgemm<T>::context_;
+  using Xgemm<T>::device_;
+  using Xgemm<T>::db_;
   using Xgemm<T>::DoGemm;
 
   // Constructor
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cpp
index 3206c669..424d4d2d 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xsyr2k.h"
+#include "routines/level3/xsyr2k.hpp"
 
 #include <string>
 #include <vector>
@@ -19,23 +19,16 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xsyr2k<float>::precision_ = Precision::kSingle;
-template <> const Precision Xsyr2k<double>::precision_ = Precision::kDouble;
-template <> const Precision Xsyr2k<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
+    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
@@ -73,11 +66,11 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
   //    matrix A cannot be less than N when rotated, or less than K when not-rotated
   //    matrix B cannot be less than N when rotated, or less than K when not-rotated
   //    matrix C cannot be less than N
-  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(ab_one, ab_two, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld, sizeof(T));
+  status = TestMatrixB(ab_one, ab_two, b_buffer, b_offset, b_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
   if (ErrorIn(status)) { return status; }
 
   // Calculates the ceiled versions of n and k
@@ -91,7 +84,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
   try {
 
     // Loads the program from the database
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
 
     // Determines whether or not temporary matrices are needed
     auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
@@ -104,6 +97,12 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
+    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    auto alpha_buffer = Buffer<T>(context_, 1);
+    auto beta_buffer = Buffer<T>(context_, 1);
+    alpha_buffer.Write(queue_, 1, &alpha);
+    beta_buffer.Write(queue_, 1, &beta);
+
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -113,19 +112,21 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     // case nothing has to be done, these kernels can be skipped.
     if (!a_no_temp) {
       auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      program, true, ab_rotated, false);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, false);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessA);
     }
     if (!b_no_temp) {
       auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
-                                      program, true, ab_rotated, false);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, false);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessB);
     }
@@ -133,10 +134,11 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
     // modify the other triangle.
     auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+    status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    program, true, c_rotated, false);
+                                    ConstantOne<T>(), program,
+                                    true, c_rotated, false);
     if (ErrorIn(status)) { return status; }
     eventWaitList.push_back(eventProcessC);
 
@@ -147,8 +149,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, beta);
+      kernel.SetArgument(2, alpha_buffer());
+      kernel.SetArgument(3, beta_buffer());
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, b_temp());
       kernel.SetArgument(6, c_temp());
@@ -162,29 +164,31 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
 
       // Launches the kernel
       auto eventKernel1 = Event();
-      status = RunKernel(kernel, global, local, eventKernel1.pointer(), eventWaitList);
+      status = RunKernel(kernel, queue_, device_, global, local, eventKernel1.pointer(), eventWaitList);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventKernel1);
 
       // Swaps the arguments for matrices A and B, and sets 'beta' to 1
       auto one = static_cast<T>(1);
-      kernel.SetArgument(3, one);
+      beta_buffer.Write(queue_, 1, &one);
+      kernel.SetArgument(3, beta_buffer());
       kernel.SetArgument(4, b_temp());
       kernel.SetArgument(5, a_temp());
 
       // Runs the kernel again
       auto eventKernel2 = Event();
-      status = RunKernel(kernel, global, local, eventKernel2.pointer(), eventWaitList);
+      status = RunKernel(kernel, queue_, device_, global, local, eventKernel2.pointer(), eventWaitList);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventKernel2);
 
       // Runs the post-processing kernel
       auto upper = (triangle == Triangle::kUpper);
       auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(event_, eventWaitList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
-                                      program, false, c_rotated, false, upper, lower, false);
+                                      ConstantOne<T>(), program,
+                                      false, c_rotated, false, upper, lower, false);
       if (ErrorIn(status)) { return status; }
 
       // Successfully finished the computation
@@ -196,6 +200,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsyr2k<half>;
 template class Xsyr2k<float>;
 template class Xsyr2k<double>;
 template class Xsyr2k<float2>;
diff --git a/include/internal/routines/level3/xsyr2k.h b/src/routines/level3/xsyr2k.hpp
index 88669626..56185653 100644
--- a/include/internal/routines/level3/xsyr2k.h
+++ b/src/routines/level3/xsyr2k.hpp
@@ -16,30 +16,16 @@
 #ifndef CLBLAST_ROUTINES_XSYR2K_H_
 #define CLBLAST_ROUTINES_XSYR2K_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xsyr2k: public Routine<T> {
+class Xsyr2k: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::PadCopyTransposeMatrix;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixB;
-  using Routine<T>::TestMatrixC;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xsyr2k(Queue &queue, EventPointer event, const std::string &name = "SYR2K");
 
@@ -51,10 +37,6 @@ class Xsyr2k: public Routine<T> {
                      const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                      const T beta,
                      const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cpp
index 741ad064..f56c232b 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xsyrk.h"
+#include "routines/level3/xsyrk.hpp"
 
 #include <string>
 #include <vector>
@@ -19,23 +19,16 @@
 namespace clblast {
 // =================================================================================================
 
-// Specific implementations to get the memory-type based on a template argument
-template <> const Precision Xsyrk<float>::precision_ = Precision::kSingle;
-template <> const Precision Xsyrk<double>::precision_ = Precision::kDouble;
-template <> const Precision Xsyrk<float2>::precision_ = Precision::kComplexSingle;
-template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDouble;
-
-// =================================================================================================
-
 // Constructor: forwards to base class constructor
 template <typename T>
 Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
-    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, precision_) {
+    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
   source_string_ =
-    #include "../../kernels/level3/copy.opencl"
-    #include "../../kernels/level3/pad.opencl"
-    #include "../../kernels/level3/transpose.opencl"
-    #include "../../kernels/level3/padtranspose.opencl"
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
   ;
@@ -71,9 +64,9 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
   // space. Also tests that the leading dimensions of:
   //    matrix A cannot be less than N when rotated, or less than K when not-rotated
   //    matrix C cannot be less than N
-  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
-  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld, sizeof(T));
+  status = TestMatrixC(n, n, c_buffer, c_offset, c_ld);
   if (ErrorIn(status)) { return status; }
 
   // Calculates the ceiled versions of n and k
@@ -87,7 +80,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
   try {
 
     // Loads the program from the database
-    const auto program = GetProgramFromCache();
+    const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
 
     // Determines whether or not temporary matrices are needed
     auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 &&
@@ -97,6 +90,12 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
     auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
+    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    auto alpha_buffer = Buffer<T>(context_, 1);
+    auto beta_buffer = Buffer<T>(context_, 1);
+    alpha_buffer.Write(queue_, 1, &alpha);
+    beta_buffer.Write(queue_, 1, &beta);
+
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -106,10 +105,11 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
     // case nothing has to be done, these kernels can be skipped.
     if (!a_no_temp) {
       auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      program, true, a_rotated, false);
+                                      ConstantOne<T>(), program,
+                                      true, a_rotated, false);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessA);
     }
@@ -117,10 +117,11 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
     // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
     // modify the other triangle.
     auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
+    status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    program, true, c_rotated, false);
+                                    ConstantOne<T>(), program,
+                                    true, c_rotated, false);
     if (ErrorIn(status)) { return status; }
     eventWaitList.push_back(eventProcessC);
 
@@ -131,8 +132,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha);
-      kernel.SetArgument(3, beta);
+      kernel.SetArgument(2, alpha_buffer());
+      kernel.SetArgument(3, beta_buffer());
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, a_temp());
       kernel.SetArgument(6, c_temp());
@@ -146,17 +147,18 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
 
       // Launches the kernel
       auto eventKernel = Event();
-      status = RunKernel(kernel, global, local, eventKernel.pointer(), eventWaitList);
+      status = RunKernel(kernel, queue_, device_, global, local, eventKernel.pointer(), eventWaitList);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventKernel);
 
       // Runs the post-processing kernel
       auto upper = (triangle == Triangle::kUpper);
       auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(event_, eventWaitList,
+      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
-                                      program, false, c_rotated, false, upper, lower, false);
+                                      ConstantOne<T>(), program,
+                                      false, c_rotated, false, upper, lower, false);
       if (ErrorIn(status)) { return status; }
 
 
@@ -169,6 +171,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
 // =================================================================================================
 
 // Compiles the templated class
+template class Xsyrk<half>;
 template class Xsyrk<float>;
 template class Xsyrk<double>;
 template class Xsyrk<float2>;
diff --git a/include/internal/routines/level3/xsyrk.h b/src/routines/level3/xsyrk.hpp
index e95c7c1c..7c075c26 100644
--- a/include/internal/routines/level3/xsyrk.h
+++ b/src/routines/level3/xsyrk.hpp
@@ -18,29 +18,16 @@
 #ifndef CLBLAST_ROUTINES_XSYRK_H_
 #define CLBLAST_ROUTINES_XSYRK_H_
 
-#include "internal/routine.h"
+#include "routine.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
 template <typename T>
-class Xsyrk: public Routine<T> {
+class Xsyrk: public Routine {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::source_string_;
-  using Routine<T>::queue_;
-  using Routine<T>::event_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::PadCopyTransposeMatrix;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::TestMatrixC;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
   // Constructor
   Xsyrk(Queue &queue, EventPointer event, const std::string &name = "SYRK");
 
@@ -51,10 +38,6 @@ class Xsyrk: public Routine<T> {
                     const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
                     const T beta,
                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
-
- private:
-  // Static variable to get the precision
-  const static Precision precision_;
 };
 
 // =================================================================================================
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cpp
index 9e3b27b4..74a82822 100644
--- a/src/routines/level3/xtrmm.cc
+++ b/src/routines/level3/xtrmm.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/routines/level3/xtrmm.h"
+#include "routines/level3/xtrmm.hpp"
 
 #include <string>
 #include <vector>
@@ -44,14 +44,14 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
   auto k = (side == Side::kLeft) ? m : n;
 
   // Checks for validity of the triangular A matrix
-  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld, sizeof(T));
+  auto status = TestMatrixA(k, k, a_buffer, a_offset, a_ld);
   if (ErrorIn(status)) { return status; }
 
   // Determines which kernel to run based on the layout (the Xgemm kernel assumes column-major as
   // default) and on whether we are dealing with an upper or lower triangle of the triangular matrix
   bool is_upper = ((triangle == Triangle::kUpper && layout != Layout::kRowMajor) ||
                    (triangle == Triangle::kLower && layout == Layout::kRowMajor));
-  auto kernel_name = (is_upper) ? "TrmmUpperToSquared" : "TrmmLowerToSquared";
+  auto kernel_name = (is_upper) ? "TriaUpperToSquared" : "TriaLowerToSquared";
 
   // Determines whether or not the triangular matrix is unit-diagonal
   auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
@@ -63,7 +63,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
     // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
     // routine afterwards
     try {
-      const auto program = GetProgramFromCache();
+      const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
       auto kernel = Kernel(program, kernel_name);
 
       // Sets the arguments for the triangular-to-squared kernel
@@ -83,7 +83,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
                                         Ceil(CeilDiv(k, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
       auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
       auto kernelEvent = Event();
-      status = RunKernel(kernel, global, local, kernelEvent.pointer());
+      status = RunKernel(kernel, queue_, device_, global, local, kernelEvent.pointer());
       if (ErrorIn(status)) { return status; }
 
       // Synchronize now: 'DoGemm' does not accept a list of events to wait for
@@ -130,6 +130,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
 // =================================================================================================
 
 // Compiles the templated class
+template class Xtrmm<half>;
 template class Xtrmm<float>;
 template class Xtrmm<double>;
 template class Xtrmm<float2>;
diff --git a/include/internal/routines/level3/xtrmm.h b/src/routines/level3/xtrmm.hpp
index 01f6594d..186a120e 100644
--- a/include/internal/routines/level3/xtrmm.h
+++ b/src/routines/level3/xtrmm.hpp
@@ -16,7 +16,7 @@
 #ifndef CLBLAST_ROUTINES_XTRMM_H_
 #define CLBLAST_ROUTINES_XTRMM_H_
 
-#include "internal/routines/level3/xgemm.h"
+#include "routines/level3/xgemm.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -26,15 +26,12 @@ template <typename T>
 class Xtrmm: public Xgemm<T> {
  public:
 
-  // Members and methods from the base class
-  using Routine<T>::db_;
-  using Routine<T>::context_;
-  using Routine<T>::GetProgramFromCache;
-  using Routine<T>::TestMatrixA;
-  using Routine<T>::RunKernel;
-  using Routine<T>::ErrorIn;
-
-  // Uses the regular Xgemm routine
+  // Uses methods and variables from the regular Xgemm routine
+  using Xgemm<T>::routine_name_;
+  using Xgemm<T>::queue_;
+  using Xgemm<T>::context_;
+  using Xgemm<T>::device_;
+  using Xgemm<T>::db_;
   using Xgemm<T>::DoGemm;
 
   // Constructor
diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp
new file mode 100644
index 00000000..e8593301
--- /dev/null
+++ b/src/routines/levelx/xomatcopy.cpp
@@ -0,0 +1,94 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "routines/levelx/xomatcopy.hpp"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name):
+    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, PrecisionValue<T>()) {
+  source_string_ =
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
+  ;
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
+                                    const size_t m, const size_t n, const T alpha,
+                                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Determines whether to transpose the matrix A
+  const auto transpose = (a_transpose != Transpose::kNo);
+
+  // In case of complex data-types, the transpose can also become a conjugate transpose
+  const auto conjugate = (a_transpose == Transpose::kConjugate);
+
+  // Computes the dimensions of the two matrices
+  const auto rotated = (layout == Layout::kRowMajor);
+  const auto a_one = (rotated) ? n : m;
+  const auto a_two = (rotated) ? m : n;
+  const auto b_one = (transpose) ? a_two : a_one;
+  const auto b_two = (transpose) ? a_one : a_two;
+
+  // Tests the matrices for validity, first from a perspective of the OpenCL buffers and their
+  // sizes, and then from a perspective of parameter values (e.g. m, n). Tests whether the OpenCL
+  // buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage space.
+  // Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than M when not-rotated
+  //    matrix B cannot be less than the bound for A, with M and N swapped when transposed
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld);
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld);
+  if (ErrorIn(status)) { return status; }
+
+  // Loads the program from the cache
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+  auto emptyEventList = std::vector<Event>();
+  status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList,
+                                  a_one, a_two, a_ld, a_offset, a_buffer,
+                                  b_one, b_two, b_ld, b_offset, b_buffer,
+                                  alpha, program, false, transpose, conjugate);
+  if (ErrorIn(status)) { return status; }
+
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xomatcopy<half>;
+template class Xomatcopy<float>;
+template class Xomatcopy<double>;
+template class Xomatcopy<float2>;
+template class Xomatcopy<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/routines/levelx/xomatcopy.hpp b/src/routines/levelx/xomatcopy.hpp
new file mode 100644
index 00000000..0e580230
--- /dev/null
+++ b/src/routines/levelx/xomatcopy.hpp
@@ -0,0 +1,41 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_
+#define CLBLAST_ROUTINES_XOMATCOPY_H_
+
+#include "routine.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xomatcopy: public Routine {
+ public:
+
+  // Constructor
+  Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose,
+                        const size_t m, const size_t n, const T alpha,
+                        const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XOMATCOPY_H_
+#endif
diff --git a/src/tuning/copy.cc b/src/tuning/kernels/copy_fast.cpp
index e2837e60..34269bc7 100644
--- a/src/tuning/copy.cc
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -14,8 +14,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -27,16 +27,17 @@ class TuneCopy {
 
   // The representative kernel and the source code
   static std::string KernelFamily() { return "copy"; }
-  static std::string KernelName() { return "CopyMatrix"; }
+  static std::string KernelName() { return "CopyMatrixFast"; }
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/copy.opencl"
+      #include "../src/kernels/level3/level3.opencl"
+      #include "../src/kernels/level3/copy_fast.opencl"
     ;
   }
 
   // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
@@ -85,9 +86,11 @@ class TuneCopy {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentInput(alpha_buffer);
   }
 
   // Describes how to compute the performance metrics
@@ -107,7 +110,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneCopy<half>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneCopy<float>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneCopy<double>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneCopy<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/pad.cc b/src/tuning/kernels/copy_pad.cpp
index 72729422..1e0dccd3 100644
--- a/src/tuning/pad.cc
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -14,8 +14,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -27,16 +27,17 @@ class TunePad {
 
   // The representative kernel and the source code
   static std::string KernelFamily() { return "pad"; }
-  static std::string KernelName() { return "PadMatrix"; }
+  static std::string KernelName() { return "CopyPadMatrix"; }
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/pad.opencl"
+      #include "../src/kernels/level3/level3.opencl"
+      #include "../src/kernels/level3/copy_pad.opencl"
     ;
   }
 
   // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
@@ -85,17 +86,19 @@ class TunePad {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-  tuner.AddArgumentScalar(static_cast<int>(args.m));
-  tuner.AddArgumentScalar(static_cast<int>(args.n));
-  tuner.AddArgumentScalar(static_cast<int>(args.m));
-  tuner.AddArgumentScalar(0);
-  tuner.AddArgumentInput(a_mat);
-  tuner.AddArgumentScalar(static_cast<int>(args.m));
-  tuner.AddArgumentScalar(static_cast<int>(args.n));
-  tuner.AddArgumentScalar(static_cast<int>(args.m));
-  tuner.AddArgumentScalar(0);
-  tuner.AddArgumentOutput(b_mat);
-  tuner.AddArgumentScalar(0);
+    auto alpha_buffer = std::vector<T>{args.alpha};
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(0);
+    tuner.AddArgumentInput(a_mat);
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(0);
+    tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(0);
   }
 
   // Describes how to compute the performance metrics
@@ -115,7 +118,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePad<half>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePad<float>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePad<double>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePad<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/transpose.cc b/src/tuning/kernels/transpose_fast.cpp
index 113e0a81..7ac19cb6 100644
--- a/src/tuning/transpose.cc
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -14,8 +14,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -27,16 +27,17 @@ class TuneTranspose {
 
   // The representative kernel and the source code
   static std::string KernelFamily() { return "transpose"; }
-  static std::string KernelName() { return "TransposeMatrix"; }
+  static std::string KernelName() { return "TransposeMatrixFast"; }
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/transpose.opencl"
+      #include "../src/kernels/level3/level3.opencl"
+      #include "../src/kernels/level3/transpose_fast.opencl"
     ;
   }
 
   // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
@@ -90,9 +91,11 @@ class TuneTranspose {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentInput(alpha_buffer);
   }
 
   // Describes how to compute the performance metrics
@@ -112,7 +115,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneTranspose<half>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneTranspose<float>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneTranspose<double>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneTranspose<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/padtranspose.cc b/src/tuning/kernels/transpose_pad.cpp
index 5edd89e0..63274415 100644
--- a/src/tuning/padtranspose.cc
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -14,8 +14,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -27,16 +27,17 @@ class TunePadTranspose {
 
   // The representative kernel and the source code
   static std::string KernelFamily() { return "padtranspose"; }
-  static std::string KernelName() { return "PadTransposeMatrix"; }
+  static std::string KernelName() { return "TransposePadMatrix"; }
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/padtranspose.opencl"
+      #include "../src/kernels/level3/level3.opencl"
+      #include "../src/kernels/level3/transpose_pad.opencl"
     ;
   }
 
   // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
@@ -89,6 +90,7 @@ class TunePadTranspose {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -99,6 +101,7 @@ class TunePadTranspose {
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(0);
     tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentInput(alpha_buffer);
     tuner.AddArgumentScalar(0);
   }
 
@@ -119,7 +122,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePadTranspose<half>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePadTranspose<float>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePadTranspose<double>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePadTranspose<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/xaxpy.cc b/src/tuning/kernels/xaxpy.cpp
index 31aa6a8e..88d12c1f 100644
--- a/src/tuning/xaxpy.cc
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -14,8 +14,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -89,8 +89,9 @@ class TuneXaxpy {
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(args.alpha);
+    tuner.AddArgumentInput(alpha_buffer);
     tuner.AddArgumentInput(x_vec);
     tuner.AddArgumentOutput(y_vec);
   }
@@ -112,7 +113,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXaxpy<half>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXaxpy<float>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXaxpy<double>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXaxpy<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/xdot.cc b/src/tuning/kernels/xdot.cpp
index cff656c3..1581e13f 100644
--- a/src/tuning/xdot.cc
+++ b/src/tuning/kernels/xdot.cpp
@@ -15,8 +15,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -119,7 +119,7 @@ using double2 = clblast::double2;
 template <int V>
 void StartVariation(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXdot<half, V>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXdot<float, V>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXdot<double, V>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXdot<float2, V>, float2>(argc, argv); break;
diff --git a/src/tuning/xgemm.cc b/src/tuning/kernels/xgemm.cpp
index 2b4ff456..4b1efdef 100644
--- a/src/tuning/xgemm.cc
+++ b/src/tuning/kernels/xgemm.cpp
@@ -14,8 +14,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -121,11 +121,13 @@ class TuneXgemm {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
+    auto beta_buffer = std::vector<T>{args.beta};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentScalar(args.alpha);
-    tuner.AddArgumentScalar(args.beta);
+    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentInput(beta_buffer);
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentInput(b_mat);
     tuner.AddArgumentOutput(c_mat);
@@ -148,7 +150,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/xgemv.cc b/src/tuning/kernels/xgemv.cpp
index 43369c3b..d42155ae 100644
--- a/src/tuning/xgemv.cc
+++ b/src/tuning/kernels/xgemv.cpp
@@ -17,8 +17,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -96,11 +96,13 @@ class TuneXgemv {
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
+    auto beta_buffer = std::vector<T>{args.beta};
     auto a_rotated = (V==3) ? 1 : 0;
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(args.alpha);
-    tuner.AddArgumentScalar(args.beta);
+    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentInput(beta_buffer);
     tuner.AddArgumentScalar(static_cast<int>(a_rotated));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentScalar(0);
@@ -135,7 +137,7 @@ using double2 = clblast::double2;
 template <int V>
 void StartVariation(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemv<half,V>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemv<float,V>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemv<double,V>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemv<float2,V>, float2>(argc, argv); break;
diff --git a/src/tuning/xger.cc b/src/tuning/kernels/xger.cpp
index 39efdb81..d2590c53 100644
--- a/src/tuning/xger.cc
+++ b/src/tuning/kernels/xger.cpp
@@ -14,8 +14,8 @@
 #include <string>
 #include <vector>
 
-#include "internal/utilities.h"
-#include "internal/tuning.h"
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -85,9 +85,10 @@ class TuneXger {
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentScalar(args.alpha);
+    tuner.AddArgumentInput(alpha_buffer);
     tuner.AddArgumentInput(x_vec);
     tuner.AddArgumentScalar(0); // x_offset
     tuner.AddArgumentScalar(1); // x_increment
@@ -117,7 +118,7 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXger<half>, half>(argc, argv); break;
     case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv); break;
     case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv); break;
     case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv); break;
diff --git a/include/internal/tuning.h b/src/tuning/tuning.hpp
index 215beb59..19df5f9a 100644
--- a/include/internal/tuning.h
+++ b/src/tuning/tuning.hpp
@@ -20,6 +20,8 @@
 
 #include <cltune.h>
 
+#include "utilities.hpp"
+
 namespace clblast {
 // =================================================================================================
 
@@ -50,6 +52,7 @@ void Tuner(int argc, char* argv[]) {
 
   // Tests for validity of the precision and retrieves properties
   auto isAMD = false;
+  auto isARM = false;
   auto isGPU = false;
   {
     const auto platform = Platform(args.platform_id);
@@ -58,8 +61,9 @@ void Tuner(int argc, char* argv[]) {
       printf("* Unsupported precision, skipping this tuning run\n\n");
       return;
     }
-    isAMD = device.Vendor() == "AMD" || device.Vendor() == "Advanced Micro Devices, Inc.";
-    isGPU = device.Type() == "GPU";
+    isAMD = device.IsAMD();
+    isARM = device.IsARM();
+    isGPU = device.IsGPU();
   }
 
   // Creates input buffers with random data
@@ -94,6 +98,9 @@ void Tuner(int argc, char* argv[]) {
     defines += "#define USE_CL_MAD 1\n";
     defines += "#define USE_STAGGERED_INDICES 1\n";
   }
+  if (isARM && isGPU) {
+    defines += "#define GLOBAL_MEM_FENCE 1\n";
+  }
 
   // Loads the kernel sources and defines the kernel to tune
   auto sources = defines + C::GetSources();
diff --git a/src/utilities.cc b/src/utilities.cpp
index 68a4f02a..68e480c5 100644
--- a/src/utilities.cc
+++ b/src/utilities.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "internal/utilities.h"
+#include "utilities.hpp"
 
 #include <string>
 #include <vector>
@@ -22,6 +22,56 @@
 namespace clblast {
 // =================================================================================================
 
+// Returns a scalar with a default value
+template <typename T>
+T GetScalar() {
+  return static_cast<T>(2.0);
+}
+template float GetScalar<float>();
+template double GetScalar<double>();
+
+// Specialized version of the above for half-precision
+template <>
+half GetScalar() {
+  return FloatToHalf(2.0f);
+}
+
+// Specialized versions of the above for complex data-types
+template <>
+float2 GetScalar() {
+  return {2.0f, 0.5f};
+}
+template <>
+double2 GetScalar() {
+  return {2.0, 0.5};
+}
+
+// Returns a scalar of value 1
+template <typename T>
+T ConstantOne() {
+  return static_cast<T>(1.0);
+}
+template float ConstantOne<float>();
+template double ConstantOne<double>();
+
+// Specialized version of the above for half-precision
+template <>
+half ConstantOne() {
+  return FloatToHalf(1.0f);
+}
+
+// Specialized versions of the above for complex data-types
+template <>
+float2 ConstantOne() {
+  return {1.0f, 0.0f};
+}
+template <>
+double2 ConstantOne() {
+  return {1.0, 0.0};
+}
+
+// =================================================================================================
+
 // Implements the string conversion using std::to_string if possible
 template <typename T>
 std::string ToString(T value) {
@@ -48,6 +98,12 @@ std::string ToString(double2 value) {
   return real.str()+"+"+imag.str()+"i";
 }
 
+// If not possible directly: special case for half-precision
+template <>
+std::string ToString(half value) {
+  return std::to_string(HalfToFloat(value));
+}
+
 // If not possible directly: special cases for CLBlast data-types
 template <>
 std::string ToString(Layout value) {
@@ -105,6 +161,9 @@ template <typename T>
 T ConvertArgument(const char* value) {
   return static_cast<T>(std::stoi(value));
 }
+template <> half ConvertArgument(const char* value) {
+  return FloatToHalf(static_cast<float>(std::stod(value)));
+}
 template <> float ConvertArgument(const char* value) {
   return static_cast<float>(std::stod(value));
 }
@@ -123,7 +182,7 @@ template <> double2 ConvertArgument(const char* value) {
 // This function matches patterns in the form of "-option value" or "--option value". It returns a
 // default value in case the option is not found in the argument string.
 template <typename T>
-T GetArgument(const int argc, char *argv[], std::string &help,
+T GetArgument(const int argc, char **argv, std::string &help,
               const std::string &option, const T default_value) {
 
   // Parses the argument. Note that this supports both the given option (e.g. -device) and one with
@@ -147,6 +206,7 @@ T GetArgument(const int argc, char *argv[], std::string &help,
 // Compiles the above function
 template int GetArgument<int>(const int, char **, std::string&, const std::string&, const int);
 template size_t GetArgument<size_t>(const int, char **, std::string&, const std::string&, const size_t);
+template half GetArgument<half>(const int, char **, std::string&, const std::string&, const half);
 template float GetArgument<float>(const int, char **, std::string&, const std::string&, const float);
 template double GetArgument<double>(const int, char **, std::string&, const std::string&, const double);
 template float2 GetArgument<float2>(const int, char **, std::string&, const std::string&, const float2);
@@ -227,24 +287,49 @@ void PopulateVector(std::vector<double2> &vector) {
   for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
 }
 
+// Specialized version of the above for half-precision
+template <>
+void PopulateVector(std::vector<half> &vector) {
+  const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
+  const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
+  std::mt19937 mt(GetRandomSeed());
+  std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
+  for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
+}
+
 // =================================================================================================
 
-// Returns a scalar with a default value
-template <typename T>
-T GetScalar() {
-  return static_cast<T>(2.0);
+// Conversion between half and single-precision ('result' must hold >= source.size() elements)
+std::vector<float> HalfToFloatBuffer(const std::vector<half>& source) {
+  auto result = std::vector<float>(source.size());
+  for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); }
+  return result;
+}
+void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source) {
+  for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); }
 }
-template float GetScalar<float>();
-template double GetScalar<double>();
 
-// Specialized versions of the above for complex data-types
-template <>
-float2 GetScalar() {
-  return {2.0f, 0.5f};
+// As above, but for OpenCL buffers instead of std::vectors (converted via a host-side copy)
+Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw) {
+  const auto size = source.GetSize() / sizeof(half);
+  auto queue = Queue(queue_raw);
+  auto context = queue.GetContext();
+  auto source_cpu = std::vector<half>(size);
+  source.Read(queue, size, source_cpu);
+  auto result_cpu = HalfToFloatBuffer(source_cpu);
+  auto result = Buffer<float>(context, size);
+  result.Write(queue, size, result_cpu);
+  return result;
 }
-template <>
-double2 GetScalar() {
-  return {2.0, 0.5};
+void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw) {
+  const auto size = source.GetSize() / sizeof(float);
+  auto queue = Queue(queue_raw);
+  auto context = queue.GetContext();
+  auto source_cpu = std::vector<float>(size);
+  source.Read(queue, size, source_cpu);
+  auto result_cpu = std::vector<half>(size);
+  FloatToHalfBuffer(result_cpu, source_cpu);
+  result.Write(queue, size, result_cpu);
 }
 
 // =================================================================================================
@@ -275,6 +360,13 @@ size_t GetBytes(const Precision precision) {
   }
 }
 
+// Convert the template argument into a precision value
+template <> Precision PrecisionValue<half>() { return Precision::kHalf; }
+template <> Precision PrecisionValue<float>() { return Precision::kSingle; }
+template <> Precision PrecisionValue<double>() { return Precision::kDouble; }
+template <> Precision PrecisionValue<float2>() { return Precision::kComplexSingle; }
+template <> Precision PrecisionValue<double2>() { return Precision::kComplexDouble; }
+
 // =================================================================================================
 
 // Returns false is this precision is not supported by the device
@@ -288,6 +380,11 @@ template <> bool PrecisionSupported<double2>(const Device &device) {
   auto extensions = device.Capabilities();
   return (extensions.find(kKhronosDoublePrecision) == std::string::npos) ? false : true;
 }
+template <> bool PrecisionSupported<half>(const Device &device) {
+  auto extensions = device.Capabilities();
+  if (device.Name() == "Mali-T628") { return true; } // supports fp16 but not cl_khr_fp16 officially
+  return (extensions.find(kKhronosHalfPrecision) == std::string::npos) ? false : true;
+}
 
 // =================================================================================================
 } // namespace clblast
diff --git a/include/internal/utilities.h b/src/utilities.hpp
index 82cd7f44..5a4eef0f 100644
--- a/include/internal/utilities.h
+++ b/src/utilities.hpp
@@ -22,7 +22,8 @@
 #include <complex>
 
 #include "clblast.h"
-#include "internal/clpp11.h"
+#include "clblast_half.h"
+#include "clpp11.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -94,6 +95,16 @@ constexpr auto kArgNoAbbreviations = "no_abbrv";
 
 // =================================================================================================
 
+// Returns a scalar with a default value
+template <typename T>
+T GetScalar();
+
+// Returns a scalar of value 1
+template <typename T>
+T ConstantOne();
+
+// =================================================================================================
+
 // Structure containing all possible arguments for test clients, including their default values
 template <typename T>
 struct Arguments {
@@ -124,8 +135,8 @@ struct Arguments {
   size_t nrm2_offset = 0;
   size_t asum_offset = 0;
   size_t imax_offset = 0;
-  T alpha = T{1.0};
-  T beta = T{1.0};
+  T alpha = ConstantOne<T>();
+  T beta = ConstantOne<T>();
   size_t x_size = 1;
   size_t y_size = 1;
   size_t a_size = 1;
@@ -177,7 +188,7 @@ T ConvertArgument(const char* value);
 
 // Basic argument parser, matching patterns in the form of "-option value" and "--option value"
 template <typename T>
-T GetArgument(const int argc, char *argv[], std::string &help,
+T GetArgument(const int argc, char **argv, std::string &help,
               const std::string &option, const T default_value);
 
 // Returns the precision only
@@ -189,6 +200,11 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s
 
 // =================================================================================================
 
+// Helper function to check for errors in the status code
+constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+
+// =================================================================================================
+
 // Returns a random number to be used as a seed
 unsigned int GetRandomSeed();
 
@@ -202,9 +218,13 @@ void PopulateVector(std::vector<T> &vector);
 
 // =================================================================================================
 
-// Returns a scalar with a default value
-template <typename T>
-T GetScalar();
+// Conversion between half and single-precision ('result' must hold >= source.size() elements)
+std::vector<float> HalfToFloatBuffer(const std::vector<half>& source);
+void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& source);
+
+// As above, but for OpenCL buffers instead of std::vectors (converted via a host-side copy)
+Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
+void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
 
 // =================================================================================================
 
@@ -220,6 +240,10 @@ bool IsMultiple(const size_t a, const size_t b);
 // Convert the precision enum into bytes, e.g. a double takes up 8 bytes
 size_t GetBytes(const Precision precision);
 
+// Convert the template argument into a precision value
+template <typename T>
+Precision PrecisionValue();
+
 // =================================================================================================
 
 // Returns false is this precision is not supported by the device
diff --git a/test/correctness/routines/level1/xamax.cpp b/test/correctness/routines/level1/xamax.cpp
new file mode 100644
index 00000000..607637e8
--- /dev/null
+++ b/test/correctness/routines/level1/xamax.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xamax.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  auto errors = size_t{0};
+  errors += clblast::RunTests<clblast::TestXamax<float>, float, float>(argc, argv, false, "iSAMAX");
+  errors += clblast::RunTests<clblast::TestXamax<double>, double, double>(argc, argv, true, "iDAMAX");
+  errors += clblast::RunTests<clblast::TestXamax<float2>, float2, float2>(argc, argv, true, "iCAMAX");
+  errors += clblast::RunTests<clblast::TestXamax<double2>, double2, double2>(argc, argv, true, "iZAMAX");
+  errors += clblast::RunTests<clblast::TestXamax<half>, half, half>(argc, argv, true, "iHAMAX");
+  if (errors > 0) { return 1; } else { return 0; }
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level1/xasum.cpp b/test/correctness/routines/level1/xasum.cpp
new file mode 100644
index 00000000..e22e42a6
--- /dev/null
+++ b/test/correctness/routines/level1/xasum.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xasum.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  auto errors = size_t{0};
+  errors += clblast::RunTests<clblast::TestXasum<float>, float, float>(argc, argv, false, "SASUM");
+  errors += clblast::RunTests<clblast::TestXasum<double>, double, double>(argc, argv, true, "DASUM");
+  errors += clblast::RunTests<clblast::TestXasum<float2>, float2, float2>(argc, argv, true, "ScASUM");
+  errors += clblast::RunTests<clblast::TestXasum<double2>, double2, double2>(argc, argv, true, "DzASUM");
+  errors += clblast::RunTests<clblast::TestXasum<half>, half, half>(argc, argv, true, "HASUM");
+  if (errors > 0) { return 1; } else { return 0; }
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level1/xaxpy.cpp b/test/correctness/routines/level1/xaxpy.cpp
new file mode 100644
index 00000000..064172fa
--- /dev/null
+++ b/test/correctness/routines/level1/xaxpy.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xaxpy.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  auto errors = size_t{0};
+  errors += clblast::RunTests<clblast::TestXaxpy<float>, float, float>(argc, argv, false, "SAXPY");
+  errors += clblast::RunTests<clblast::TestXaxpy<double>, double, double>(argc, argv, true, "DAXPY");
+  errors += clblast::RunTests<clblast::TestXaxpy<float2>, float2, float2>(argc, argv, true, "CAXPY");
+  errors += clblast::RunTests<clblast::TestXaxpy<double2>, double2, double2>(argc, argv, true, "ZAXPY");
+  errors += clblast::RunTests<clblast::TestXaxpy<half>, half, half>(argc, argv, true, "HAXPY");
+  if (errors > 0) { return 1; } else { return 0; }
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level1/xcopy.cpp b/test/correctness/routines/level1/xcopy.cpp
new file mode 100644
index 00000000..e6f2581b
--- /dev/null
+++ b/test/correctness/routines/level1/xcopy.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xcopy.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  auto errors = size_t{0};
+  errors += clblast::RunTests<clblast::TestXcopy<float>, float, float>(argc, argv, false, "SCOPY");
+  errors += clblast::RunTests<clblast::TestXcopy<double>, double, double>(argc, argv, true, "DCOPY");
+  errors += clblast::RunTests<clblast::TestXcopy<float2>, float2, float2>(argc, argv, true, "CCOPY");
+  errors += clblast::RunTests<clblast::TestXcopy<double2>, double2, double2>(argc, argv, true, "ZCOPY");
+  errors += clblast::RunTests<clblast::TestXcopy<half>, half, half>(argc, argv, true, "HCOPY");
+  if (errors > 0) { return 1; } else { return 0; }
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level1/xaxpy.cc b/test/correctness/routines/level1/xdot.cpp
index 746e0001..080250cb 100644
--- a/test/correctness/routines/level1/xaxpy.cc
+++ b/test/correctness/routines/level1/xdot.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level1/xaxpy.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xdot.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,11 +18,11 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXaxpy<float>, float, float>(argc, argv, false, "SAXPY");
-  clblast::RunTests<clblast::TestXaxpy<double>, double, double>(argc, argv, true, "DAXPY");
-  clblast::RunTests<clblast::TestXaxpy<float2>, float2, float2>(argc, argv, true, "CAXPY");
-  clblast::RunTests<clblast::TestXaxpy<double2>, double2, double2>(argc, argv, true, "ZAXPY");
-  return 0;
+  auto errors = size_t{0};
+  errors += clblast::RunTests<clblast::TestXdot<float>, float, float>(argc, argv, false, "SDOT");
+  errors += clblast::RunTests<clblast::TestXdot<double>, double, double>(argc, argv, true, "DDOT");
+  errors += clblast::RunTests<clblast::TestXdot<half>, half, half>(argc, argv, true, "HDOT");
+  if (errors > 0) { return 1; } else { return 0; }
 }
 
 // =================================================================================================
diff --git a/test/correctness/routines/level1/xdotc.cpp b/test/correctness/routines/level1/xdotc.cpp
new file mode 100644
index 00000000..2a7bbeca
--- /dev/null
+++ b/test/correctness/routines/level1/xdotc.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xdotc.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Program entry point (global namespace, outside clblast): runs the complex DOTC test variants
+int main(int argc, char *argv[]) {
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXdotc<float2>, float2, float2>(argc, argv, false, "CDOTC");
+  failed += clblast::RunTests<clblast::TestXdotc<double2>, double2, double2>(argc, argv, true, "ZDOTC");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level1/xdotu.cpp b/test/correctness/routines/level1/xdotu.cpp
new file mode 100644
index 00000000..1047d021
--- /dev/null
+++ b/test/correctness/routines/level1/xdotu.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xdotu.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Program entry point (global namespace, outside clblast): runs the complex DOTU test variants
+int main(int argc, char *argv[]) {
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXdotu<float2>, float2, float2>(argc, argv, false, "CDOTU");
+  failed += clblast::RunTests<clblast::TestXdotu<double2>, double2, double2>(argc, argv, true, "ZDOTU");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level1/xnrm2.cc b/test/correctness/routines/level1/xnrm2.cc
deleted file mode 100644
index 97fb0ad6..00000000
--- a/test/correctness/routines/level1/xnrm2.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level1/xnrm2.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXnrm2<float>, float, float>(argc, argv, false, "SNRM2");
-  clblast::RunTests<clblast::TestXnrm2<double>, double, double>(argc, argv, true, "DNRM2");
-  clblast::RunTests<clblast::TestXnrm2<float2>, float2, float2>(argc, argv, true, "ScNRM2");
-  clblast::RunTests<clblast::TestXnrm2<double2>, double2, double2>(argc, argv, true, "DzNRM2");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xnrm2.cpp b/test/correctness/routines/level1/xnrm2.cpp
new file mode 100644
index 00000000..142fa7ba
--- /dev/null
+++ b/test/correctness/routines/level1/xnrm2.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xnrm2.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Program entry point (global namespace, outside clblast): runs every NRM2 precision variant
+int main(int argc, char *argv[]) {
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXnrm2<float>, float, float>(argc, argv, false, "SNRM2");
+  failed += clblast::RunTests<clblast::TestXnrm2<double>, double, double>(argc, argv, true, "DNRM2");
+  failed += clblast::RunTests<clblast::TestXnrm2<float2>, float2, float2>(argc, argv, true, "ScNRM2");
+  failed += clblast::RunTests<clblast::TestXnrm2<double2>, double2, double2>(argc, argv, true, "DzNRM2");
+  failed += clblast::RunTests<clblast::TestXnrm2<half>, half, half>(argc, argv, true, "HNRM2");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level1/xdotc.cc b/test/correctness/routines/level1/xrot.cpp
index 76aaa0ec..5af358eb 100644
--- a/test/correctness/routines/level1/xdotc.cc
+++ b/test/correctness/routines/level1/xrot.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level1/xdotc.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xrot.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,9 +18,10 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXdotc<float2>, float2, float2>(argc, argv, false, "CDOTC");
-  clblast::RunTests<clblast::TestXdotc<double2>, double2, double2>(argc, argv, true, "ZDOTC");
-  return 0;
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXrot<float>, float, float>(argc, argv, false, "SROT");
+  failed += clblast::RunTests<clblast::TestXrot<double>, double, double>(argc, argv, true, "DROT");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
 }
 
 // =================================================================================================
diff --git a/test/correctness/routines/level1/xrotg.cc b/test/correctness/routines/level1/xrotg.cc
deleted file mode 100644
index dd068992..00000000
--- a/test/correctness/routines/level1/xrotg.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level1/xrotg.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXrotg<float>, float, float>(argc, argv, false, "SROTG");
-  clblast::RunTests<clblast::TestXrotg<double>, double, double>(argc, argv, true, "DROTG");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xdotu.cc b/test/correctness/routines/level1/xrotg.cpp
index aecde4f5..ad23a554 100644
--- a/test/correctness/routines/level1/xdotu.cc
+++ b/test/correctness/routines/level1/xrotg.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level1/xdotu.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xrotg.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,9 +18,10 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXdotu<float2>, float2, float2>(argc, argv, false, "CDOTU");
-  clblast::RunTests<clblast::TestXdotu<double2>, double2, double2>(argc, argv, true, "ZDOTU");
-  return 0;
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXrotg<float>, float, float>(argc, argv, false, "SROTG");
+  failed += clblast::RunTests<clblast::TestXrotg<double>, double, double>(argc, argv, true, "DROTG");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
 }
 
 // =================================================================================================
diff --git a/test/correctness/routines/level1/xrotm.cc b/test/correctness/routines/level1/xrotm.cc
deleted file mode 100644
index 869056ef..00000000
--- a/test/correctness/routines/level1/xrotm.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level1/xrotm.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXrotm<float>, float, float>(argc, argv, false, "SROTM");
-  clblast::RunTests<clblast::TestXrotm<double>, double, double>(argc, argv, true, "DROTM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xrot.cc b/test/correctness/routines/level1/xrotm.cpp
index 4020ff13..4f7e8f15 100644
--- a/test/correctness/routines/level1/xrot.cc
+++ b/test/correctness/routines/level1/xrotm.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level1/xrot.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xrotm.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,9 +18,10 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXrot<float>, float, float>(argc, argv, false, "SROT");
-  clblast::RunTests<clblast::TestXrot<double>, double, double>(argc, argv, true, "DROT");
-  return 0;
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXrotm<float>, float, float>(argc, argv, false, "SROTM");
+  failed += clblast::RunTests<clblast::TestXrotm<double>, double, double>(argc, argv, true, "DROTM");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
 }
 
 // =================================================================================================
diff --git a/test/correctness/routines/level1/xrotmg.cc b/test/correctness/routines/level1/xrotmg.cc
deleted file mode 100644
index 29f8b0e1..00000000
--- a/test/correctness/routines/level1/xrotmg.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level1/xrotmg.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXrotmg<float>, float, float>(argc, argv, false, "SROTMG");
-  clblast::RunTests<clblast::TestXrotmg<double>, double, double>(argc, argv, true, "DROTMG");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xdot.cc b/test/correctness/routines/level1/xrotmg.cpp
index 5ea105e0..ca89bc12 100644
--- a/test/correctness/routines/level1/xdot.cc
+++ b/test/correctness/routines/level1/xrotmg.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level1/xdot.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xrotmg.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,9 +18,10 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXdot<float>, float, float>(argc, argv, false, "SDOT");
-  clblast::RunTests<clblast::TestXdot<double>, double, double>(argc, argv, true, "DDOT");
-  return 0;
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXrotmg<float>, float, float>(argc, argv, false, "SROTMG");
+  failed += clblast::RunTests<clblast::TestXrotmg<double>, double, double>(argc, argv, true, "DROTMG");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
 }
 
 // =================================================================================================
diff --git a/test/correctness/routines/level1/xscal.cc b/test/correctness/routines/level1/xscal.cc
deleted file mode 100644
index 4d138fad..00000000
--- a/test/correctness/routines/level1/xscal.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level1/xscal.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXscal<float>, float, float>(argc, argv, false, "SSCAL");
-  clblast::RunTests<clblast::TestXscal<double>, double, double>(argc, argv, true, "DSCAL");
-  clblast::RunTests<clblast::TestXscal<float2>, float2, float2>(argc, argv, true, "CSCAL");
-  clblast::RunTests<clblast::TestXscal<double2>, double2, double2>(argc, argv, true, "ZSCAL");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xscal.cpp b/test/correctness/routines/level1/xscal.cpp
new file mode 100644
index 00000000..939524be
--- /dev/null
+++ b/test/correctness/routines/level1/xscal.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xscal.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Program entry point (global namespace, outside clblast): runs every SCAL precision variant
+int main(int argc, char *argv[]) {
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXscal<float>, float, float>(argc, argv, false, "SSCAL");
+  failed += clblast::RunTests<clblast::TestXscal<double>, double, double>(argc, argv, true, "DSCAL");
+  failed += clblast::RunTests<clblast::TestXscal<float2>, float2, float2>(argc, argv, true, "CSCAL");
+  failed += clblast::RunTests<clblast::TestXscal<double2>, double2, double2>(argc, argv, true, "ZSCAL");
+  failed += clblast::RunTests<clblast::TestXscal<half>, half, half>(argc, argv, true, "HSCAL");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level1/xswap.cc b/test/correctness/routines/level1/xswap.cc
deleted file mode 100644
index 38f110f7..00000000
--- a/test/correctness/routines/level1/xswap.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level1/xswap.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXswap<float>, float, float>(argc, argv, false, "SSWAP");
-  clblast::RunTests<clblast::TestXswap<double>, double, double>(argc, argv, true, "DSWAP");
-  clblast::RunTests<clblast::TestXswap<float2>, float2, float2>(argc, argv, true, "CSWAP");
-  clblast::RunTests<clblast::TestXswap<double2>, double2, double2>(argc, argv, true, "ZSWAP");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xswap.cpp b/test/correctness/routines/level1/xswap.cpp
new file mode 100644
index 00000000..446f3d65
--- /dev/null
+++ b/test/correctness/routines/level1/xswap.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level1/xswap.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Program entry point (global namespace, outside clblast): runs every SWAP precision variant
+int main(int argc, char *argv[]) {
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXswap<float>, float, float>(argc, argv, false, "SSWAP");
+  failed += clblast::RunTests<clblast::TestXswap<double>, double, double>(argc, argv, true, "DSWAP");
+  failed += clblast::RunTests<clblast::TestXswap<float2>, float2, float2>(argc, argv, true, "CSWAP");
+  failed += clblast::RunTests<clblast::TestXswap<double2>, double2, double2>(argc, argv, true, "ZSWAP");
+  failed += clblast::RunTests<clblast::TestXswap<half>, half, half>(argc, argv, true, "HSWAP");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xgbmv.cc b/test/correctness/routines/level2/xgbmv.cc
deleted file mode 100644
index b28c5978..00000000
--- a/test/correctness/routines/level2/xgbmv.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xgbmv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXgbmv<float>, float, float>(argc, argv, false, "SGBMV");
-  clblast::RunTests<clblast::TestXgbmv<double>, double, double>(argc, argv, true, "DGBMV");
-  clblast::RunTests<clblast::TestXgbmv<float2>, float2, float2>(argc, argv, true, "CGBMV");
-  clblast::RunTests<clblast::TestXgbmv<double2>, double2, double2>(argc, argv, true, "ZGBMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xgbmv.cpp b/test/correctness/routines/level2/xgbmv.cpp
new file mode 100644
index 00000000..8c49bc65
--- /dev/null
+++ b/test/correctness/routines/level2/xgbmv.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xgbmv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Program entry point (global namespace, outside clblast): runs every GBMV precision variant
+int main(int argc, char *argv[]) {
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXgbmv<float>, float, float>(argc, argv, false, "SGBMV");
+  failed += clblast::RunTests<clblast::TestXgbmv<double>, double, double>(argc, argv, true, "DGBMV");
+  failed += clblast::RunTests<clblast::TestXgbmv<float2>, float2, float2>(argc, argv, true, "CGBMV");
+  failed += clblast::RunTests<clblast::TestXgbmv<double2>, double2, double2>(argc, argv, true, "ZGBMV");
+  failed += clblast::RunTests<clblast::TestXgbmv<half>, half, half>(argc, argv, true, "HGBMV");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xgemv.cc b/test/correctness/routines/level2/xgemv.cc
deleted file mode 100644
index 14eb74d1..00000000
--- a/test/correctness/routines/level2/xgemv.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xgemv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXgemv<float>, float, float>(argc, argv, false, "SGEMV");
-  clblast::RunTests<clblast::TestXgemv<double>, double, double>(argc, argv, true, "DGEMV");
-  clblast::RunTests<clblast::TestXgemv<float2>, float2, float2>(argc, argv, true, "CGEMV");
-  clblast::RunTests<clblast::TestXgemv<double2>, double2, double2>(argc, argv, true, "ZGEMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xgemv.cpp b/test/correctness/routines/level2/xgemv.cpp
new file mode 100644
index 00000000..902ae777
--- /dev/null
+++ b/test/correctness/routines/level2/xgemv.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xgemv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Program entry point (global namespace, outside clblast): runs every GEMV precision variant
+int main(int argc, char *argv[]) {
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXgemv<float>, float, float>(argc, argv, false, "SGEMV");
+  failed += clblast::RunTests<clblast::TestXgemv<double>, double, double>(argc, argv, true, "DGEMV");
+  failed += clblast::RunTests<clblast::TestXgemv<float2>, float2, float2>(argc, argv, true, "CGEMV");
+  failed += clblast::RunTests<clblast::TestXgemv<double2>, double2, double2>(argc, argv, true, "ZGEMV");
+  failed += clblast::RunTests<clblast::TestXgemv<half>, half, half>(argc, argv, true, "HGEMV");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xger.cc b/test/correctness/routines/level2/xger.cc
deleted file mode 100644
index c37a5c41..00000000
--- a/test/correctness/routines/level2/xger.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xger.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXger<float>, float, float>(argc, argv, false, "SGER");
-  clblast::RunTests<clblast::TestXger<double>, double, double>(argc, argv, true, "DGER");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xcopy.cc b/test/correctness/routines/level2/xger.cpp
index 3e16ffc6..ce61bbcb 100644
--- a/test/correctness/routines/level1/xcopy.cc
+++ b/test/correctness/routines/level2/xger.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level1/xcopy.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xger.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,11 +18,11 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXcopy<float>, float, float>(argc, argv, false, "SCOPY");
-  clblast::RunTests<clblast::TestXcopy<double>, double, double>(argc, argv, true, "DCOPY");
-  clblast::RunTests<clblast::TestXcopy<float2>, float2, float2>(argc, argv, true, "CCOPY");
-  clblast::RunTests<clblast::TestXcopy<double2>, double2, double2>(argc, argv, true, "ZCOPY");
-  return 0;
+  size_t failed = 0;  // running sum of the values returned by each RunTests call
+  failed += clblast::RunTests<clblast::TestXger<float>, float, float>(argc, argv, false, "SGER");
+  failed += clblast::RunTests<clblast::TestXger<double>, double, double>(argc, argv, true, "DGER");
+  failed += clblast::RunTests<clblast::TestXger<half>, half, half>(argc, argv, true, "HGER");
+  return (failed > 0) ? 1 : 0;  // exit status 1 when the summed results are non-zero
 }
 
 // =================================================================================================
diff --git a/test/correctness/routines/level2/xgerc.cc b/test/correctness/routines/level2/xgerc.cc
deleted file mode 100644
index 8fd31142..00000000
--- a/test/correctness/routines/level2/xgerc.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xgerc.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXgerc<float2>, float2, float2>(argc, argv, false, "CGERC");
-  clblast::RunTests<clblast::TestXgerc<double2>, double2, double2>(argc, argv, true, "ZGERC");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xgerc.cpp b/test/correctness/routines/level2/xgerc.cpp
new file mode 100644
index 00000000..b747f20d
--- /dev/null
+++ b/test/correctness/routines/level2/xgerc.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xgerc.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXgerc<float2>, float2, float2>(argc, argv, false, "CGERC");
+  failed += clblast::RunTests<clblast::TestXgerc<double2>, double2, double2>(argc, argv, true, "ZGERC");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xgeru.cc b/test/correctness/routines/level2/xgeru.cc
deleted file mode 100644
index ee92416b..00000000
--- a/test/correctness/routines/level2/xgeru.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xgeru.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXgeru<float2>, float2, float2>(argc, argv, false, "CGERU");
-  clblast::RunTests<clblast::TestXgeru<double2>, double2, double2>(argc, argv, true, "ZGERU");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xgeru.cpp b/test/correctness/routines/level2/xgeru.cpp
new file mode 100644
index 00000000..f80c1e2b
--- /dev/null
+++ b/test/correctness/routines/level2/xgeru.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xgeru.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXgeru<float2>, float2, float2>(argc, argv, false, "CGERU");
+  failed += clblast::RunTests<clblast::TestXgeru<double2>, double2, double2>(argc, argv, true, "ZGERU");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xhbmv.cc b/test/correctness/routines/level2/xhbmv.cc
deleted file mode 100644
index 4cd137a7..00000000
--- a/test/correctness/routines/level2/xhbmv.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xhbmv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXhbmv<float2>, float2, float2>(argc, argv, false, "CHBMV");
-  clblast::RunTests<clblast::TestXhbmv<double2>, double2, double2>(argc, argv, true, "ZHBMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xhbmv.cpp b/test/correctness/routines/level2/xhbmv.cpp
new file mode 100644
index 00000000..a4885c01
--- /dev/null
+++ b/test/correctness/routines/level2/xhbmv.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xhbmv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXhbmv<float2>, float2, float2>(argc, argv, false, "CHBMV");
+  failed += clblast::RunTests<clblast::TestXhbmv<double2>, double2, double2>(argc, argv, true, "ZHBMV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xhemv.cc b/test/correctness/routines/level2/xhemv.cc
deleted file mode 100644
index 20c5370c..00000000
--- a/test/correctness/routines/level2/xhemv.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xhemv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXhemv<float2>, float2, float2>(argc, argv, false, "CHEMV");
-  clblast::RunTests<clblast::TestXhemv<double2>, double2, double2>(argc, argv, true, "ZHEMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xhemv.cpp b/test/correctness/routines/level2/xhemv.cpp
new file mode 100644
index 00000000..4318ffee
--- /dev/null
+++ b/test/correctness/routines/level2/xhemv.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xhemv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXhemv<float2>, float2, float2>(argc, argv, false, "CHEMV");
+  failed += clblast::RunTests<clblast::TestXhemv<double2>, double2, double2>(argc, argv, true, "ZHEMV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xher.cc b/test/correctness/routines/level2/xher.cc
deleted file mode 100644
index 5b9b48be..00000000
--- a/test/correctness/routines/level2/xher.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xher.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXher<float2,float>, float2, float>(argc, argv, false, "CHER");
-  clblast::RunTests<clblast::TestXher<double2,double>, double2, double>(argc, argv, true, "ZHER");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xher.cpp b/test/correctness/routines/level2/xher.cpp
new file mode 100644
index 00000000..fe37bd76
--- /dev/null
+++ b/test/correctness/routines/level2/xher.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xher.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXher<float2,float>, float2, float>(argc, argv, false, "CHER");
+  failed += clblast::RunTests<clblast::TestXher<double2,double>, double2, double>(argc, argv, true, "ZHER");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xher2.cc b/test/correctness/routines/level2/xher2.cc
deleted file mode 100644
index 093b3959..00000000
--- a/test/correctness/routines/level2/xher2.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xher2.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXher2<float2>, float2, float2>(argc, argv, false, "CHER2");
-  clblast::RunTests<clblast::TestXher2<double2>, double2, double2>(argc, argv, true, "ZHER2");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xher2.cpp b/test/correctness/routines/level2/xher2.cpp
new file mode 100644
index 00000000..0b4af4d0
--- /dev/null
+++ b/test/correctness/routines/level2/xher2.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xher2.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXher2<float2>, float2, float2>(argc, argv, false, "CHER2");
+  failed += clblast::RunTests<clblast::TestXher2<double2>, double2, double2>(argc, argv, true, "ZHER2");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xhpmv.cc b/test/correctness/routines/level2/xhpmv.cc
deleted file mode 100644
index cbf41443..00000000
--- a/test/correctness/routines/level2/xhpmv.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xhpmv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXhpmv<float2>, float2, float2>(argc, argv, false, "CHPMV");
-  clblast::RunTests<clblast::TestXhpmv<double2>, double2, double2>(argc, argv, true, "ZHPMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xhpmv.cpp b/test/correctness/routines/level2/xhpmv.cpp
new file mode 100644
index 00000000..dd77df71
--- /dev/null
+++ b/test/correctness/routines/level2/xhpmv.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xhpmv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXhpmv<float2>, float2, float2>(argc, argv, false, "CHPMV");
+  failed += clblast::RunTests<clblast::TestXhpmv<double2>, double2, double2>(argc, argv, true, "ZHPMV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xhpr.cc b/test/correctness/routines/level2/xhpr.cc
deleted file mode 100644
index a720aaef..00000000
--- a/test/correctness/routines/level2/xhpr.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xhpr.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXhpr<float2,float>, float2, float>(argc, argv, false, "CHPR");
-  clblast::RunTests<clblast::TestXhpr<double2,double>, double2, double>(argc, argv, true, "ZHPR");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xhpr.cpp b/test/correctness/routines/level2/xhpr.cpp
new file mode 100644
index 00000000..5a3f615f
--- /dev/null
+++ b/test/correctness/routines/level2/xhpr.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xhpr.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXhpr<float2,float>, float2, float>(argc, argv, false, "CHPR");
+  failed += clblast::RunTests<clblast::TestXhpr<double2,double>, double2, double>(argc, argv, true, "ZHPR");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xhpr2.cc b/test/correctness/routines/level2/xhpr2.cc
deleted file mode 100644
index 0fed97e1..00000000
--- a/test/correctness/routines/level2/xhpr2.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xhpr2.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXhpr2<float2>, float2, float2>(argc, argv, false, "CHPR2");
-  clblast::RunTests<clblast::TestXhpr2<double2>, double2, double2>(argc, argv, true, "ZHPR2");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xhpr2.cpp b/test/correctness/routines/level2/xhpr2.cpp
new file mode 100644
index 00000000..8218b444
--- /dev/null
+++ b/test/correctness/routines/level2/xhpr2.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xhpr2.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXhpr2<float2>, float2, float2>(argc, argv, false, "CHPR2");
+  failed += clblast::RunTests<clblast::TestXhpr2<double2>, double2, double2>(argc, argv, true, "ZHPR2");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xsbmv.cc b/test/correctness/routines/level2/xsbmv.cc
deleted file mode 100644
index 212e2c3a..00000000
--- a/test/correctness/routines/level2/xsbmv.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xsbmv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXsbmv<float>, float, float>(argc, argv, false, "SSBMV");
-  clblast::RunTests<clblast::TestXsbmv<double>, double, double>(argc, argv, true, "DSBMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xamax.cc b/test/correctness/routines/level2/xsbmv.cpp
index ade09e7a..7918cb21 100644
--- a/test/correctness/routines/level1/xamax.cc
+++ b/test/correctness/routines/level2/xsbmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level1/xamax.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xsbmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,11 +18,11 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXamax<float>, float, float>(argc, argv, false, "iSAMAX");
-  clblast::RunTests<clblast::TestXamax<double>, double, double>(argc, argv, true, "iDAMAX");
-  clblast::RunTests<clblast::TestXamax<float2>, float2, float2>(argc, argv, true, "iCAMAX");
-  clblast::RunTests<clblast::TestXamax<double2>, double2, double2>(argc, argv, true, "iZAMAX");
-  return 0;
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXsbmv<float>, float, float>(argc, argv, false, "SSBMV");
+  failed += clblast::RunTests<clblast::TestXsbmv<double>, double, double>(argc, argv, true, "DSBMV");
+  failed += clblast::RunTests<clblast::TestXsbmv<half>, half, half>(argc, argv, true, "HSBMV");
+  return (failed > 0) ? 1 : 0;
 }
 
 // =================================================================================================
diff --git a/test/correctness/routines/level2/xspmv.cc b/test/correctness/routines/level2/xspmv.cc
deleted file mode 100644
index dc833024..00000000
--- a/test/correctness/routines/level2/xspmv.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xspmv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXspmv<float>, float, float>(argc, argv, false, "SSPMV");
-  clblast::RunTests<clblast::TestXspmv<double>, double, double>(argc, argv, true, "DSPMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xspmv.cpp b/test/correctness/routines/level2/xspmv.cpp
new file mode 100644
index 00000000..78210660
--- /dev/null
+++ b/test/correctness/routines/level2/xspmv.cpp
@@ -0,0 +1,28 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xspmv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point (outside the clblast namespace): returns 1 if the summed RunTests results are > 0
+int main(int argc, char *argv[]) {
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXspmv<float>, float, float>(argc, argv, false, "SSPMV");
+  failed += clblast::RunTests<clblast::TestXspmv<double>, double, double>(argc, argv, true, "DSPMV");
+  failed += clblast::RunTests<clblast::TestXspmv<half>, half, half>(argc, argv, true, "HSPMV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xspr.cc b/test/correctness/routines/level2/xspr.cc
deleted file mode 100644
index a0104dd4..00000000
--- a/test/correctness/routines/level2/xspr.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xspr.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXspr<float>, float, float>(argc, argv, false, "SSPR");
-  clblast::RunTests<clblast::TestXspr<double>, double, double>(argc, argv, true, "DSPR");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level1/xasum.cc b/test/correctness/routines/level2/xspr.cpp
index 5ec20596..d05adf34 100644
--- a/test/correctness/routines/level1/xasum.cc
+++ b/test/correctness/routines/level2/xspr.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level1/xasum.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xspr.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,11 +18,11 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXasum<float>, float, float>(argc, argv, false, "SASUM");
-  clblast::RunTests<clblast::TestXasum<double>, double, double>(argc, argv, true, "DASUM");
-  clblast::RunTests<clblast::TestXasum<float2>, float2, float2>(argc, argv, true, "ScASUM");
-  clblast::RunTests<clblast::TestXasum<double2>, double2, double2>(argc, argv, true, "DzASUM");
-  return 0;
+  auto failed = size_t{0};
+  failed += clblast::RunTests<clblast::TestXspr<float>, float, float>(argc, argv, false, "SSPR");
+  failed += clblast::RunTests<clblast::TestXspr<double>, double, double>(argc, argv, true, "DSPR");
+  failed += clblast::RunTests<clblast::TestXspr<half>, half, half>(argc, argv, true, "HSPR");
+  return (failed > 0) ? 1 : 0;
 }
 
 // =================================================================================================
diff --git a/test/correctness/routines/level2/xspr2.cc b/test/correctness/routines/level2/xspr2.cc
deleted file mode 100644
index 5fe5827f..00000000
--- a/test/correctness/routines/level2/xspr2.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xspr2.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXspr2<float>, float, float>(argc, argv, false, "SSPR2");
-  clblast::RunTests<clblast::TestXspr2<double>, double, double>(argc, argv, true, "DSPR2");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xspr2.cpp b/test/correctness/routines/level2/xspr2.cpp
new file mode 100644
index 00000000..caa46a09
--- /dev/null
+++ b/test/correctness/routines/level2/xspr2.cpp
@@ -0,0 +1,28 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xspr2.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXspr2<float>, float, float>(argc, argv, false, "SSPR2");
+  failed += clblast::RunTests<clblast::TestXspr2<double>, double, double>(argc, argv, true, "DSPR2");
+  failed += clblast::RunTests<clblast::TestXspr2<half>, half, half>(argc, argv, true, "HSPR2");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xsymv.cc b/test/correctness/routines/level2/xsymv.cc
deleted file mode 100644
index 6224739f..00000000
--- a/test/correctness/routines/level2/xsymv.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xsymv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXsymv<float>, float, float>(argc, argv, false, "SSYMV");
-  clblast::RunTests<clblast::TestXsymv<double>, double, double>(argc, argv, true, "DSYMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xsymv.cpp b/test/correctness/routines/level2/xsymv.cpp
new file mode 100644
index 00000000..978a5f8a
--- /dev/null
+++ b/test/correctness/routines/level2/xsymv.cpp
@@ -0,0 +1,28 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xsymv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXsymv<float>, float, float>(argc, argv, false, "SSYMV");
+  failed += clblast::RunTests<clblast::TestXsymv<double>, double, double>(argc, argv, true, "DSYMV");
+  failed += clblast::RunTests<clblast::TestXsymv<half>, half, half>(argc, argv, true, "HSYMV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xsyr.cc b/test/correctness/routines/level2/xsyr.cc
deleted file mode 100644
index a47b918f..00000000
--- a/test/correctness/routines/level2/xsyr.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xsyr.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXsyr<float>, float, float>(argc, argv, false, "SSYR");
-  clblast::RunTests<clblast::TestXsyr<double>, double, double>(argc, argv, true, "DSYR");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xsyr.cpp b/test/correctness/routines/level2/xsyr.cpp
new file mode 100644
index 00000000..244dbfb4
--- /dev/null
+++ b/test/correctness/routines/level2/xsyr.cpp
@@ -0,0 +1,28 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xsyr.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXsyr<float>, float, float>(argc, argv, false, "SSYR");
+  failed += clblast::RunTests<clblast::TestXsyr<double>, double, double>(argc, argv, true, "DSYR");
+  failed += clblast::RunTests<clblast::TestXsyr<half>, half, half>(argc, argv, true, "HSYR");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xsyr2.cc b/test/correctness/routines/level2/xsyr2.cc
deleted file mode 100644
index 1743632c..00000000
--- a/test/correctness/routines/level2/xsyr2.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xsyr2.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXsyr2<float>, float, float>(argc, argv, false, "SSYR2");
-  clblast::RunTests<clblast::TestXsyr2<double>, double, double>(argc, argv, true, "DSYR2");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xsyr2.cpp b/test/correctness/routines/level2/xsyr2.cpp
new file mode 100644
index 00000000..422e67ad
--- /dev/null
+++ b/test/correctness/routines/level2/xsyr2.cpp
@@ -0,0 +1,28 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xsyr2.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXsyr2<float>, float, float>(argc, argv, false, "SSYR2");
+  failed += clblast::RunTests<clblast::TestXsyr2<double>, double, double>(argc, argv, true, "DSYR2");
+  failed += clblast::RunTests<clblast::TestXsyr2<half>, half, half>(argc, argv, true, "HSYR2");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xtbmv.cc b/test/correctness/routines/level2/xtbmv.cc
deleted file mode 100644
index d3bbbade..00000000
--- a/test/correctness/routines/level2/xtbmv.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xtbmv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXtbmv<float>, float, float>(argc, argv, false, "STBMV");
-  clblast::RunTests<clblast::TestXtbmv<double>, double, double>(argc, argv, true, "DTBMV");
-  clblast::RunTests<clblast::TestXtbmv<float2>, float2, float2>(argc, argv, true, "CTBMV");
-  clblast::RunTests<clblast::TestXtbmv<double2>, double2, double2>(argc, argv, true, "ZTBMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xtbmv.cpp b/test/correctness/routines/level2/xtbmv.cpp
new file mode 100644
index 00000000..491708ec
--- /dev/null
+++ b/test/correctness/routines/level2/xtbmv.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xtbmv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXtbmv<float>, float, float>(argc, argv, false, "STBMV");
+  failed += clblast::RunTests<clblast::TestXtbmv<double>, double, double>(argc, argv, true, "DTBMV");
+  failed += clblast::RunTests<clblast::TestXtbmv<float2>, float2, float2>(argc, argv, true, "CTBMV");
+  failed += clblast::RunTests<clblast::TestXtbmv<double2>, double2, double2>(argc, argv, true, "ZTBMV");
+  failed += clblast::RunTests<clblast::TestXtbmv<half>, half, half>(argc, argv, true, "HTBMV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xtbsv.cc b/test/correctness/routines/level2/xtbsv.cc
deleted file mode 100644
index c8a8a583..00000000
--- a/test/correctness/routines/level2/xtbsv.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xtbsv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXtbsv<float>, float, float>(argc, argv, false, "STBSV");
-  clblast::RunTests<clblast::TestXtbsv<double>, double, double>(argc, argv, true, "DTBSV");
-  clblast::RunTests<clblast::TestXtbsv<float2>, float2, float2>(argc, argv, true, "CTBSV");
-  clblast::RunTests<clblast::TestXtbsv<double2>, double2, double2>(argc, argv, true, "ZTBSV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xtbsv.cpp b/test/correctness/routines/level2/xtbsv.cpp
new file mode 100644
index 00000000..12b5dca5
--- /dev/null
+++ b/test/correctness/routines/level2/xtbsv.cpp
@@ -0,0 +1,29 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xtbsv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXtbsv<float>, float, float>(argc, argv, false, "STBSV");
+  failed += clblast::RunTests<clblast::TestXtbsv<double>, double, double>(argc, argv, true, "DTBSV");
+  failed += clblast::RunTests<clblast::TestXtbsv<float2>, float2, float2>(argc, argv, true, "CTBSV");
+  failed += clblast::RunTests<clblast::TestXtbsv<double2>, double2, double2>(argc, argv, true, "ZTBSV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xtpmv.cc b/test/correctness/routines/level2/xtpmv.cc
deleted file mode 100644
index 95489a65..00000000
--- a/test/correctness/routines/level2/xtpmv.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xtpmv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXtpmv<float>, float, float>(argc, argv, false, "STPMV");
-  clblast::RunTests<clblast::TestXtpmv<double>, double, double>(argc, argv, true, "DTPMV");
-  clblast::RunTests<clblast::TestXtpmv<float2>, float2, float2>(argc, argv, true, "CTPMV");
-  clblast::RunTests<clblast::TestXtpmv<double2>, double2, double2>(argc, argv, true, "ZTPMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xtpmv.cpp b/test/correctness/routines/level2/xtpmv.cpp
new file mode 100644
index 00000000..b89f0adc
--- /dev/null
+++ b/test/correctness/routines/level2/xtpmv.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xtpmv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXtpmv<float>, float, float>(argc, argv, false, "STPMV");
+  failed += clblast::RunTests<clblast::TestXtpmv<double>, double, double>(argc, argv, true, "DTPMV");
+  failed += clblast::RunTests<clblast::TestXtpmv<float2>, float2, float2>(argc, argv, true, "CTPMV");
+  failed += clblast::RunTests<clblast::TestXtpmv<double2>, double2, double2>(argc, argv, true, "ZTPMV");
+  failed += clblast::RunTests<clblast::TestXtpmv<half>, half, half>(argc, argv, true, "HTPMV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xtpsv.cc b/test/correctness/routines/level2/xtpsv.cc
deleted file mode 100644
index 97d27271..00000000
--- a/test/correctness/routines/level2/xtpsv.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xtpsv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXtpsv<float>, float, float>(argc, argv, false, "STPSV");
-  clblast::RunTests<clblast::TestXtpsv<double>, double, double>(argc, argv, true, "DTPSV");
-  clblast::RunTests<clblast::TestXtpsv<float2>, float2, float2>(argc, argv, true, "CTPSV");
-  clblast::RunTests<clblast::TestXtpsv<double2>, double2, double2>(argc, argv, true, "ZTPSV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xtpsv.cpp b/test/correctness/routines/level2/xtpsv.cpp
new file mode 100644
index 00000000..6e6e7c85
--- /dev/null
+++ b/test/correctness/routines/level2/xtpsv.cpp
@@ -0,0 +1,29 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xtpsv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXtpsv<float>, float, float>(argc, argv, false, "STPSV");
+  failed += clblast::RunTests<clblast::TestXtpsv<double>, double, double>(argc, argv, true, "DTPSV");
+  failed += clblast::RunTests<clblast::TestXtpsv<float2>, float2, float2>(argc, argv, true, "CTPSV");
+  failed += clblast::RunTests<clblast::TestXtpsv<double2>, double2, double2>(argc, argv, true, "ZTPSV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xtrmv.cc b/test/correctness/routines/level2/xtrmv.cc
deleted file mode 100644
index ca50af88..00000000
--- a/test/correctness/routines/level2/xtrmv.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xtrmv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXtrmv<float>, float, float>(argc, argv, false, "STRMV");
-  clblast::RunTests<clblast::TestXtrmv<double>, double, double>(argc, argv, true, "DTRMV");
-  clblast::RunTests<clblast::TestXtrmv<float2>, float2, float2>(argc, argv, true, "CTRMV");
-  clblast::RunTests<clblast::TestXtrmv<double2>, double2, double2>(argc, argv, true, "ZTRMV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xtrmv.cpp b/test/correctness/routines/level2/xtrmv.cpp
new file mode 100644
index 00000000..819f5cad
--- /dev/null
+++ b/test/correctness/routines/level2/xtrmv.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xtrmv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXtrmv<float>, float, float>(argc, argv, false, "STRMV");
+  failed += clblast::RunTests<clblast::TestXtrmv<double>, double, double>(argc, argv, true, "DTRMV");
+  failed += clblast::RunTests<clblast::TestXtrmv<float2>, float2, float2>(argc, argv, true, "CTRMV");
+  failed += clblast::RunTests<clblast::TestXtrmv<double2>, double2, double2>(argc, argv, true, "ZTRMV");
+  failed += clblast::RunTests<clblast::TestXtrmv<half>, half, half>(argc, argv, true, "HTRMV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level2/xtrsv.cc b/test/correctness/routines/level2/xtrsv.cc
deleted file mode 100644
index bfca0f20..00000000
--- a/test/correctness/routines/level2/xtrsv.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level2/xtrsv.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXtrsv<float>, float, float>(argc, argv, false, "STRSV");
-  clblast::RunTests<clblast::TestXtrsv<double>, double, double>(argc, argv, true, "DTRSV");
-  clblast::RunTests<clblast::TestXtrsv<float2>, float2, float2>(argc, argv, true, "CTRSV");
-  clblast::RunTests<clblast::TestXtrsv<double2>, double2, double2>(argc, argv, true, "ZTRSV");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level2/xtrsv.cpp b/test/correctness/routines/level2/xtrsv.cpp
new file mode 100644
index 00000000..78e33807
--- /dev/null
+++ b/test/correctness/routines/level2/xtrsv.cpp
@@ -0,0 +1,29 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level2/xtrsv.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Entry point at global scope: accumulates the RunTests results; a non-zero sum yields exit code 1
+int main(int argc, char *argv[]) {
+  size_t failed = 0;
+  failed += clblast::RunTests<clblast::TestXtrsv<float>, float, float>(argc, argv, false, "STRSV");
+  failed += clblast::RunTests<clblast::TestXtrsv<double>, double, double>(argc, argv, true, "DTRSV");
+  failed += clblast::RunTests<clblast::TestXtrsv<float2>, float2, float2>(argc, argv, true, "CTRSV");
+  failed += clblast::RunTests<clblast::TestXtrsv<double2>, double2, double2>(argc, argv, true, "ZTRSV");
+  return (failed > 0) ? 1 : 0;
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xgemm.cc b/test/correctness/routines/level3/xgemm.cc
deleted file mode 100644
index 632724ed..00000000
--- a/test/correctness/routines/level3/xgemm.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level3/xgemm.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXgemm<float>, float, float>(argc, argv, false, "SGEMM");
-  clblast::RunTests<clblast::TestXgemm<double>, double, double>(argc, argv, true, "DGEMM");
-  clblast::RunTests<clblast::TestXgemm<float2>, float2, float2>(argc, argv, true, "CGEMM");
-  clblast::RunTests<clblast::TestXgemm<double2>, double2, double2>(argc, argv, true, "ZGEMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level3/xgemm.cpp b/test/correctness/routines/level3/xgemm.cpp
new file mode 100644
index 00000000..54d41719
--- /dev/null
+++ b/test/correctness/routines/level3/xgemm.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xgemm.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXgemm<float>, float, float>(argc, argv, false, "SGEMM");
+  failures += clblast::RunTests<clblast::TestXgemm<double>, double, double>(argc, argv, true, "DGEMM");
+  failures += clblast::RunTests<clblast::TestXgemm<float2>, float2, float2>(argc, argv, true, "CGEMM");
+  failures += clblast::RunTests<clblast::TestXgemm<double2>, double2, double2>(argc, argv, true, "ZGEMM");
+  failures += clblast::RunTests<clblast::TestXgemm<half>, half, half>(argc, argv, true, "HGEMM");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xhemm.cc b/test/correctness/routines/level3/xhemm.cc
deleted file mode 100644
index 74e22080..00000000
--- a/test/correctness/routines/level3/xhemm.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level3/xhemm.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXhemm<float2>, float2, float2>(argc, argv, false, "CHEMM");
-  clblast::RunTests<clblast::TestXhemm<double2>, double2, double2>(argc, argv, true, "ZHEMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level3/xhemm.cpp b/test/correctness/routines/level3/xhemm.cpp
new file mode 100644
index 00000000..76c970a7
--- /dev/null
+++ b/test/correctness/routines/level3/xhemm.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xhemm.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXhemm<float2>, float2, float2>(argc, argv, false, "CHEMM");
+  failures += clblast::RunTests<clblast::TestXhemm<double2>, double2, double2>(argc, argv, true, "ZHEMM");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xher2k.cpp b/test/correctness/routines/level3/xher2k.cpp
new file mode 100644
index 00000000..c653265e
--- /dev/null
+++ b/test/correctness/routines/level3/xher2k.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xher2k.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXher2k<float2,float>, float2, float>(argc, argv, false, "CHER2K");
+  failures += clblast::RunTests<clblast::TestXher2k<double2,double>, double2, double>(argc, argv, true, "ZHER2K");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xherk.cc b/test/correctness/routines/level3/xherk.cc
deleted file mode 100644
index 32a39bfc..00000000
--- a/test/correctness/routines/level3/xherk.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level3/xherk.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXherk<float2,float>, float2, float>(argc, argv, false, "CHERK");
-  clblast::RunTests<clblast::TestXherk<double2,double>, double2, double>(argc, argv, true, "ZHERK");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level3/xherk.cpp b/test/correctness/routines/level3/xherk.cpp
new file mode 100644
index 00000000..09ea9e4d
--- /dev/null
+++ b/test/correctness/routines/level3/xherk.cpp
@@ -0,0 +1,27 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xherk.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXherk<float2,float>, float2, float>(argc, argv, false, "CHERK");
+  failures += clblast::RunTests<clblast::TestXherk<double2,double>, double2, double>(argc, argv, true, "ZHERK");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsymm.cc b/test/correctness/routines/level3/xsymm.cc
deleted file mode 100644
index 046fca16..00000000
--- a/test/correctness/routines/level3/xsymm.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level3/xsymm.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXsymm<float>, float, float>(argc, argv, false, "SSYMM");
-  clblast::RunTests<clblast::TestXsymm<double>, double, double>(argc, argv, true, "DSYMM");
-  clblast::RunTests<clblast::TestXsymm<float2>, float2, float2>(argc, argv, true, "CSYMM");
-  clblast::RunTests<clblast::TestXsymm<double2>, double2, double2>(argc, argv, true, "ZSYMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level3/xsymm.cpp b/test/correctness/routines/level3/xsymm.cpp
new file mode 100644
index 00000000..3cb3515a
--- /dev/null
+++ b/test/correctness/routines/level3/xsymm.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xsymm.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXsymm<float>, float, float>(argc, argv, false, "SSYMM");
+  failures += clblast::RunTests<clblast::TestXsymm<double>, double, double>(argc, argv, true, "DSYMM");
+  failures += clblast::RunTests<clblast::TestXsymm<float2>, float2, float2>(argc, argv, true, "CSYMM");
+  failures += clblast::RunTests<clblast::TestXsymm<double2>, double2, double2>(argc, argv, true, "ZSYMM");
+  failures += clblast::RunTests<clblast::TestXsymm<half>, half, half>(argc, argv, true, "HSYMM");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsyr2k.cc b/test/correctness/routines/level3/xsyr2k.cc
deleted file mode 100644
index db2b83d9..00000000
--- a/test/correctness/routines/level3/xsyr2k.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level3/xsyr2k.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXsyr2k<float>, float, float>(argc, argv, false, "SSYR2K");
-  clblast::RunTests<clblast::TestXsyr2k<double>, double, double>(argc, argv, true, "DSYR2K");
-  clblast::RunTests<clblast::TestXsyr2k<float2>, float2, float2>(argc, argv, true, "CSYR2K");
-  clblast::RunTests<clblast::TestXsyr2k<double2>, double2, double2>(argc, argv, true, "ZSYR2K");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level3/xsyr2k.cpp b/test/correctness/routines/level3/xsyr2k.cpp
new file mode 100644
index 00000000..617af04d
--- /dev/null
+++ b/test/correctness/routines/level3/xsyr2k.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xsyr2k.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXsyr2k<float>, float, float>(argc, argv, false, "SSYR2K");
+  failures += clblast::RunTests<clblast::TestXsyr2k<double>, double, double>(argc, argv, true, "DSYR2K");
+  failures += clblast::RunTests<clblast::TestXsyr2k<float2>, float2, float2>(argc, argv, true, "CSYR2K");
+  failures += clblast::RunTests<clblast::TestXsyr2k<double2>, double2, double2>(argc, argv, true, "ZSYR2K");
+  failures += clblast::RunTests<clblast::TestXsyr2k<half>, half, half>(argc, argv, true, "HSYR2K");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xsyrk.cc b/test/correctness/routines/level3/xsyrk.cc
deleted file mode 100644
index 3dad3535..00000000
--- a/test/correctness/routines/level3/xsyrk.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level3/xsyrk.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXsyrk<float>, float, float>(argc, argv, false, "SSYRK");
-  clblast::RunTests<clblast::TestXsyrk<double>, double, double>(argc, argv, true, "DSYRK");
-  clblast::RunTests<clblast::TestXsyrk<float2>, float2, float2>(argc, argv, true, "CSYRK");
-  clblast::RunTests<clblast::TestXsyrk<double2>, double2, double2>(argc, argv, true, "ZSYRK");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level3/xsyrk.cpp b/test/correctness/routines/level3/xsyrk.cpp
new file mode 100644
index 00000000..2014b8d0
--- /dev/null
+++ b/test/correctness/routines/level3/xsyrk.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xsyrk.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXsyrk<float>, float, float>(argc, argv, false, "SSYRK");
+  failures += clblast::RunTests<clblast::TestXsyrk<double>, double, double>(argc, argv, true, "DSYRK");
+  failures += clblast::RunTests<clblast::TestXsyrk<float2>, float2, float2>(argc, argv, true, "CSYRK");
+  failures += clblast::RunTests<clblast::TestXsyrk<double2>, double2, double2>(argc, argv, true, "ZSYRK");
+  failures += clblast::RunTests<clblast::TestXsyrk<half>, half, half>(argc, argv, true, "HSYRK");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xtrmm.cc b/test/correctness/routines/level3/xtrmm.cc
deleted file mode 100644
index 2d843e3e..00000000
--- a/test/correctness/routines/level3/xtrmm.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level3/xtrmm.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXtrmm<float>, float, float>(argc, argv, false, "STRMM");
-  clblast::RunTests<clblast::TestXtrmm<double>, double, double>(argc, argv, true, "DTRMM");
-  clblast::RunTests<clblast::TestXtrmm<float2>, float2, float2>(argc, argv, true, "CTRMM");
-  clblast::RunTests<clblast::TestXtrmm<double2>, double2, double2>(argc, argv, true, "ZTRMM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level3/xtrmm.cpp b/test/correctness/routines/level3/xtrmm.cpp
new file mode 100644
index 00000000..32640d52
--- /dev/null
+++ b/test/correctness/routines/level3/xtrmm.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xtrmm.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXtrmm<float>, float, float>(argc, argv, false, "STRMM");
+  failures += clblast::RunTests<clblast::TestXtrmm<double>, double, double>(argc, argv, true, "DTRMM");
+  failures += clblast::RunTests<clblast::TestXtrmm<float2>, float2, float2>(argc, argv, true, "CTRMM");
+  failures += clblast::RunTests<clblast::TestXtrmm<double2>, double2, double2>(argc, argv, true, "ZTRMM");
+  failures += clblast::RunTests<clblast::TestXtrmm<half>, half, half>(argc, argv, true, "HTRMM");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xtrsm.cc b/test/correctness/routines/level3/xtrsm.cc
deleted file mode 100644
index b5f5045e..00000000
--- a/test/correctness/routines/level3/xtrsm.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// =================================================================================================
-
-#include "correctness/testblas.h"
-#include "routines/level3/xtrsm.h"
-
-// Shortcuts to the clblast namespace
-using float2 = clblast::float2;
-using double2 = clblast::double2;
-
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXtrsm<float>, float, float>(argc, argv, false, "STRSM");
-  clblast::RunTests<clblast::TestXtrsm<double>, double, double>(argc, argv, true, "DTRSM");
-  clblast::RunTests<clblast::TestXtrsm<float2>, float2, float2>(argc, argv, true, "CTRSM");
-  clblast::RunTests<clblast::TestXtrsm<double2>, double2, double2>(argc, argv, true, "ZTRSM");
-  return 0;
-}
-
-// =================================================================================================
diff --git a/test/correctness/routines/level3/xtrsm.cpp b/test/correctness/routines/level3/xtrsm.cpp
new file mode 100644
index 00000000..6119bd17
--- /dev/null
+++ b/test/correctness/routines/level3/xtrsm.cpp
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/correctness/testblas.hpp"
+#include "test/routines/level3/xtrsm.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Test driver entry point, defined at global scope rather than inside the clblast namespace
+int main(int argc, char *argv[]) {
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXtrsm<float>, float, float>(argc, argv, false, "STRSM");
+  failures += clblast::RunTests<clblast::TestXtrsm<double>, double, double>(argc, argv, true, "DTRSM");
+  failures += clblast::RunTests<clblast::TestXtrsm<float2>, float2, float2>(argc, argv, true, "CTRSM");
+  failures += clblast::RunTests<clblast::TestXtrsm<double2>, double2, double2>(argc, argv, true, "ZTRSM");
+  failures += clblast::RunTests<clblast::TestXtrsm<half>, half, half>(argc, argv, true, "HTRSM");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
+}
+
+// =================================================================================================
diff --git a/test/correctness/routines/level3/xher2k.cc b/test/correctness/routines/levelx/xomatcopy.cpp
index 6377572a..e034bc18 100644
--- a/test/correctness/routines/level3/xher2k.cc
+++ b/test/correctness/routines/levelx/xomatcopy.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "correctness/testblas.h"
-#include "routines/level3/xher2k.h"
+#include "test/correctness/testblas.hpp"
+#include "test/routines/levelx/xomatcopy.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -18,9 +18,13 @@ using double2 = clblast::double2;
 
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
-  clblast::RunTests<clblast::TestXher2k<float2,float>, float2, float>(argc, argv, false, "CHER2K");
-  clblast::RunTests<clblast::TestXher2k<double2,double>, double2, double>(argc, argv, true, "ZHER2K");
-  return 0;
+  size_t failures = 0;  // running sum of the values returned by the per-precision test runs
+  failures += clblast::RunTests<clblast::TestXomatcopy<float>, float, float>(argc, argv, false, "SOMATCOPY");
+  failures += clblast::RunTests<clblast::TestXomatcopy<double>, double, double>(argc, argv, true, "DOMATCOPY");
+  failures += clblast::RunTests<clblast::TestXomatcopy<float2>, float2, float2>(argc, argv, true, "COMATCOPY");
+  failures += clblast::RunTests<clblast::TestXomatcopy<double2>, double2, double2>(argc, argv, true, "ZOMATCOPY");
+  failures += clblast::RunTests<clblast::TestXomatcopy<half>, half, half>(argc, argv, true, "HOMATCOPY");
+  return (failures != 0) ? 1 : 0;  // exit status 1 when the sum is non-zero, 0 otherwise
 }
 
 // =================================================================================================
diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cpp
index e70c0361..2e751255 100644
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cpp
@@ -14,12 +14,13 @@
 #include <algorithm>
 #include <iostream>
 
-#include "correctness/testblas.h"
+#include "test/correctness/testblas.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // The transpose-options to test with (data-type dependent)
+template <> const std::vector<Transpose> TestBlas<half,half>::kTransposes = {Transpose::kNo, Transpose::kYes};
 template <> const std::vector<Transpose> TestBlas<float,float>::kTransposes = {Transpose::kNo, Transpose::kYes};
 template <> const std::vector<Transpose> TestBlas<double,double>::kTransposes = {Transpose::kNo, Transpose::kYes};
 template <> const std::vector<Transpose> TestBlas<float2,float2>::kTransposes = {Transpose::kNo, Transpose::kYes, Transpose::kConjugate};
@@ -50,12 +51,12 @@ TestBlas<T,U>::TestBlas(int argc, char *argv[], const bool silent,
   else { throw std::runtime_error("Invalid configuration: no reference to test against"); }
 
   // Computes the maximum sizes. This allows for a single set of input/output buffers.
-  auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
-  auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
-  auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
-  auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
-  auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
-  auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
+  const auto max_vec = *std::max_element(kVectorDims.begin(), kVectorDims.end());
+  const auto max_inc = *std::max_element(kIncrements.begin(), kIncrements.end());
+  const auto max_mat = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  const auto max_ld = *std::max_element(kMatrixDims.begin(), kMatrixDims.end());
+  const auto max_matvec = *std::max_element(kMatrixVectorDims.begin(), kMatrixVectorDims.end());
+  const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
 
   // Creates test input data
   x_source_.resize(std::max(max_vec, max_matvec)*max_inc + max_offset);
@@ -83,9 +84,15 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
   TestStart("regular behaviour", name);
 
   // Iterates over all the to-be-tested combinations of arguments
-  for (auto &args: test_vector) {
+  for (const auto &args: test_vector) {
 
-    // Runs the CLBlast code
+    // Prints the current test configuration
+    if (verbose_) {
+      fprintf(stdout, "   Testing: %s", GetOptionsString(args).c_str());
+      std::cout << std::flush;
+    }
+
+    // Set-up for the CLBlast run
     auto x_vec2 = Buffer<T>(context_, args.x_size);
     auto y_vec2 = Buffer<T>(context_, args.y_size);
     auto a_mat2 = Buffer<T>(context_, args.a_size);
@@ -101,15 +108,22 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     ap_mat2.Write(queue_, args.ap_size, ap_source_);
     scalar2.Write(queue_, args.scalar_size, scalar_source_);
     auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
-    auto status2 = run_routine_(args, buffers2, queue_);
+
+    // Runs CLBlast
+    if (verbose_) {
+      fprintf(stdout, "[CLBlast]");
+      std::cout << std::flush;
+    }
+    const auto status2 = run_routine_(args, buffers2, queue_);
 
     // Don't continue with CBLAS if there are incorrect parameters
     if (compare_cblas_ && status2 != StatusCode::kSuccess) {
+      if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; }
       TestErrorCodes(status2, status2, args);
       continue;
     }
 
-    // Runs the reference BLAS code
+    // Set-up for the reference run
     auto x_vec1 = Buffer<T>(context_, args.x_size);
     auto y_vec1 = Buffer<T>(context_, args.y_size);
     auto a_mat1 = Buffer<T>(context_, args.a_size);
@@ -125,9 +139,17 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
     ap_mat1.Write(queue_, args.ap_size, ap_source_);
     scalar1.Write(queue_, args.scalar_size, scalar_source_);
     auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
-    auto status1 = run_reference_(args, buffers1, queue_);
+
+    // Runs the reference code
+    if (verbose_) {
+      if (compare_clblas_) { fprintf(stdout, " [clBLAS]"); }
+      else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); }
+      std::cout << std::flush;
+    }
+    const auto status1 = run_reference_(args, buffers1, queue_);
 
     // Tests for equality of the two status codes
+    if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; }
     if (status1 != StatusCode::kSuccess || status2 != StatusCode::kSuccess) {
       TestErrorCodes(status1, status2, args);
       continue;
@@ -147,10 +169,8 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
           if (verbose_) {
             if (get_id2_(args) == 1) { fprintf(stdout, "\n   Error at index %zu: ", id1); }
             else { fprintf(stdout, "\n   Error at %zu,%zu: ", id1, id2); }
-            std::cout << result1[index];
-            fprintf(stdout, " (reference) versus ");
-            std::cout << result2[index];
-            fprintf(stdout, " (CLBlast)");
+            fprintf(stdout, " %s (reference) versus ", ToString(result1[index]).c_str());
+            fprintf(stdout, " %s (CLBlast)", ToString(result2[index]).c_str());
           }
         }
       }
@@ -170,11 +190,18 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
 template <typename T, typename U>
 void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) {
   if (!PrecisionSupported<T>(device_)) { return; }
-  if (!compare_clblas_) { return; }
+  if (!compare_clblas_) { return; } // not supported for CPU BLAS routines
+  if (std::is_same<T, half>::value) { return; } // not supported for half-precision
   TestStart("invalid buffer sizes", name);
 
   // Iterates over all the to-be-tested combinations of arguments
-  for (auto &args: test_vector) {
+  for (const auto &args: test_vector) {
+
+    // Prints the current test configuration
+    if (verbose_) {
+      fprintf(stdout, "   Testing: %s", GetSizesString(args).c_str());
+      std::cout << std::flush;
+    }
 
     // Creates the OpenCL buffers. Note: we are not using the C++ version since we explicitly
     // want to be able to create invalid buffers (no error checking here).
@@ -206,14 +233,26 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
     auto c_mat2 = Buffer<T>(c2);
     auto ap_mat2 = Buffer<T>(ap2);
     auto scalar2 = Buffer<T>(d2);
-
-    // Runs the two routines
     auto buffers1 = Buffers<T>{x_vec1, y_vec1, a_mat1, b_mat1, c_mat1, ap_mat1, scalar1};
     auto buffers2 = Buffers<T>{x_vec2, y_vec2, a_mat2, b_mat2, c_mat2, ap_mat2, scalar2};
-    auto status1 = run_reference_(args, buffers1, queue_);
-    auto status2 = run_routine_(args, buffers2, queue_);
+
+    // Runs CLBlast
+    if (verbose_) {
+      fprintf(stdout, "[CLBlast]");
+      std::cout << std::flush;
+    }
+    const auto status2 = run_routine_(args, buffers2, queue_);
+
+    // Runs the reference code
+    if (verbose_) {
+      if (compare_clblas_) { fprintf(stdout, " [clBLAS]"); }
+      else if (compare_cblas_) { fprintf(stdout, " [CPU BLAS]"); }
+      std::cout << std::flush;
+    }
+    const auto status1 = run_reference_(args, buffers1, queue_);
 
     // Tests for equality of the two status codes
+    if (verbose_) { fprintf(stdout, " -> "); std::cout << std::flush; }
     TestErrorCodes(status1, status2, args);
   }
   TestEnd();
@@ -222,6 +261,7 @@ void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const st
 // =================================================================================================
 
 // Compiles the templated class
+template class TestBlas<half, half>;
 template class TestBlas<float, float>;
 template class TestBlas<double, double>;
 template class TestBlas<float2, float2>;
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.hpp
index 4ffc1558..d01cd06c 100644
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.hpp
@@ -21,7 +21,7 @@
 #include <string>
 #include <algorithm>
 
-#include "correctness/tester.h"
+#include "test/correctness/tester.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -46,6 +46,8 @@ class TestBlas: public Tester<T,U> {
   using Tester<T,U>::TestErrorCount;
   using Tester<T,U>::TestErrorCodes;
   using Tester<T,U>::GetOffsets;
+  using Tester<T,U>::GetOptionsString;
+  using Tester<T,U>::GetSizesString;
 
   // Test settings for the regular test. Append to these lists in case more tests are required.
   const std::vector<size_t> kVectorDims = { 7, 93, 4096 };
@@ -113,7 +115,7 @@ class TestBlas: public Tester<T,U> {
 // The interface to the correctness tester. This is a separate function in the header such that it
 // is automatically compiled for each routine, templated by the parameter "C".
 template <typename C, typename T, typename U>
-void RunTests(int argc, char *argv[], const bool silent, const std::string &name) {
+size_t RunTests(int argc, char *argv[], const bool silent, const std::string &name) {
 
   // Sets the reference to test against
   #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS)
@@ -127,6 +129,13 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
     const auto reference_routine2 = C::RunReference2; // CBLAS
   #endif
 
+  // Non-BLAS routines cannot be fully tested
+  if (!silent && C::BLASLevel() == 4) {
+    fprintf(stdout, "\n* NOTE: This non-BLAS routine is tested against a custom implementation,\n");
+    fprintf(stdout, "  not against clBLAS or a CPU BLAS library. Thus, the arguments '-clblas'\n");
+    fprintf(stdout, "  and '-cblas' have no effect.\n");
+  }
+
   // Creates a tester
   auto options = C::GetOptions();
   TestBlas<T,U> tester{argc, argv, silent, name, options,
@@ -174,8 +183,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
   auto ap_sizes = std::vector<size_t>{args.ap_size};
 
   // Sets the dimensions of the matrices or vectors depending on the BLAS level
-  auto dimensions = (C::BLASLevel() == 3) ? tester.kMatrixDims :
-                    (C::BLASLevel() == 2) ? tester.kMatrixVectorDims :
+  auto dimensions = (C::BLASLevel() == 4) ? tester.kMatrixDims : // non-BLAS extra routines
+                    (C::BLASLevel() == 3) ? tester.kMatrixDims : // level 3
+                    (C::BLASLevel() == 2) ? tester.kMatrixVectorDims : // level 2
                     tester.kVectorDims; // else: level 1
 
   // For the options relevant to this routine, sets the vectors to proper values
@@ -316,7 +326,9 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
               // Runs the tests
               tester.TestRegular(regular_test_vector, case_name);
               #ifdef CLBLAST_REF_CLBLAS
-                tester.TestInvalid(invalid_test_vector, case_name);
+                if (C::BLASLevel() != 4) {
+                  tester.TestInvalid(invalid_test_vector, case_name);
+                }
               #endif
             }
           }
@@ -324,6 +336,7 @@ void RunTests(int argc, char *argv[], const bool silent, const std::string &name
       }
     }
   }
+  return tester.NumFailedTests();
 }
 
 // =================================================================================================
diff --git a/test/correctness/tester.cc b/test/correctness/tester.cpp
index 85ae7091..92e2c1b8 100644
--- a/test/correctness/tester.cc
+++ b/test/correctness/tester.cpp
@@ -11,13 +11,13 @@
 //
 // =================================================================================================
 
-#include "correctness/tester.h"
-
 #include <string>
 #include <vector>
 #include <iostream>
 #include <cmath>
 
+#include "test/correctness/tester.hpp"
+
 namespace clblast {
 // =================================================================================================
 
@@ -45,8 +45,8 @@ Tester<T,U>::Tester(int argc, char *argv[], const bool silent,
 
   // Determines which reference to test against
   #if defined(CLBLAST_REF_CLBLAS) && defined(CLBLAST_REF_CBLAS)
-    compare_clblas_ = GetArgument(argc, argv, help_, kArgCompareclblas, 1);
-    compare_cblas_  = GetArgument(argc, argv, help_, kArgComparecblas, 0);
+    compare_clblas_ = GetArgument(argc, argv, help_, kArgCompareclblas, 0);
+    compare_cblas_  = GetArgument(argc, argv, help_, kArgComparecblas, 1);
   #elif CLBLAST_REF_CLBLAS
     compare_clblas_ = GetArgument(argc, argv, help_, kArgCompareclblas, 1);
     compare_cblas_ = 0;
@@ -137,7 +137,7 @@ void Tester<T,U>::TestStart(const std::string &test_name, const std::string &tes
   fprintf(stdout, "* Testing %s'%s'%s for %s'%s'%s:\n",
           kPrintMessage.c_str(), test_name.c_str(), kPrintEnd.c_str(),
           kPrintMessage.c_str(), test_configuration.c_str(), kPrintEnd.c_str());
-  fprintf(stdout, "   ");
+  if (!verbose_) { fprintf(stdout, "   "); }
 
   // Empties the error log and the error/pass counters
   error_log_.clear();
@@ -151,7 +151,7 @@ void Tester<T,U>::TestStart(const std::string &test_name, const std::string &tes
 // summary of the number of sub-tests passed/failed.
 template <typename T, typename U>
 void Tester<T,U>::TestEnd() {
-  fprintf(stdout, "\n");
+  if (!verbose_) { fprintf(stdout, "\n"); }
   tests_passed_ += num_passed_;
   tests_skipped_ += num_skipped_;
   tests_failed_ += num_failed_;
@@ -250,6 +250,53 @@ const std::vector<size_t> Tester<T,U>::GetOffsets() const {
   else { return {0}; }
 }
 
+// Retrieves the options as a single 'name=value ' string for a specific test, in options_ order
+template <typename T, typename U>
+std::string Tester<T,U>::GetOptionsString(const Arguments<U> &args) {
+  auto result = std::string("");
+  const auto equals = std::string("=");
+  for (auto &o: options_) {  // entries of options_ without a matching branch below add nothing
+    if (o == kArgM)        { result += kArgM + equals + ToString(args.m) + " "; }
+    if (o == kArgN)        { result += kArgN + equals + ToString(args.n) + " "; }
+    if (o == kArgK)        { result += kArgK + equals + ToString(args.k) + " "; }
+    if (o == kArgKU)       { result += kArgKU + equals + ToString(args.ku) + " "; }
+    if (o == kArgKL)       { result += kArgKL + equals + ToString(args.kl) + " "; }
+    if (o == kArgXInc)     { result += kArgXInc + equals + ToString(args.x_inc) + " "; }
+    if (o == kArgYInc)     { result += kArgYInc + equals + ToString(args.y_inc) + " "; }
+    if (o == kArgXOffset)  { result += kArgXOffset + equals + ToString(args.x_offset) + " "; }
+    if (o == kArgYOffset)  { result += kArgYOffset + equals + ToString(args.y_offset) + " "; }
+    if (o == kArgALeadDim) { result += kArgALeadDim + equals + ToString(args.a_ld) + " "; }
+    if (o == kArgBLeadDim) { result += kArgBLeadDim + equals + ToString(args.b_ld) + " "; }
+    if (o == kArgCLeadDim) { result += kArgCLeadDim + equals + ToString(args.c_ld) + " "; }
+    if (o == kArgAOffset)  { result += kArgAOffset + equals + ToString(args.a_offset) + " "; }
+    if (o == kArgBOffset)  { result += kArgBOffset + equals + ToString(args.b_offset) + " "; }
+    if (o == kArgCOffset)  { result += kArgCOffset + equals + ToString(args.c_offset) + " "; }
+    if (o == kArgAPOffset) { result += kArgAPOffset + equals + ToString(args.ap_offset) + " "; }
+    if (o == kArgDotOffset){ result += kArgDotOffset + equals + ToString(args.dot_offset) + " "; }
+  }
+  return result;  // each appended entry ends in a space, so a non-empty result has a trailing one
+}
+
+// Retrieves m/n/k and the buffer sizes as a 'name=value ' string (for invalid buffer-size tests)
+template <typename T, typename U>
+std::string Tester<T,U>::GetSizesString(const Arguments<U> &args) {
+  auto result = std::string("");
+  const auto equals = std::string("=");
+  for (auto &o: options_) {  // a buffer's size is listed when its offset option is in options_
+    if (o == kArgM)        { result += kArgM + equals + ToString(args.m) + " "; }
+    if (o == kArgN)        { result += kArgN + equals + ToString(args.n) + " "; }
+    if (o == kArgK)        { result += kArgK + equals + ToString(args.k) + " "; }
+    if (o == kArgXOffset)  { result += "xsize" + equals + ToString(args.x_size) + " "; }
+    if (o == kArgYOffset)  { result += "ysize" + equals + ToString(args.y_size) + " "; }
+    if (o == kArgAOffset)  { result += "asize" + equals + ToString(args.a_size) + " "; }
+    if (o == kArgBOffset)  { result += "bsize" + equals + ToString(args.b_size) + " "; }
+    if (o == kArgCOffset)  { result += "csize" + equals + ToString(args.c_size) + " "; }
+    if (o == kArgAPOffset) { result += "apsize" + equals + ToString(args.ap_size) + " "; }
+    if (o == kArgDotOffset){ result += "scalarsize" + equals + ToString(args.scalar_size) + " "; }
+  }
+  return result;  // each appended entry ends in a space, so a non-empty result has a trailing one
+}
+
 // =================================================================================================
 
 // A test can either pass, be skipped, or fail
@@ -273,13 +320,19 @@ void Tester<T,U>::ReportError(const ErrorLogEntry &error_log_entry) {
 // line by printing newlines once every so many calls.
 template <typename T, typename U>
 void Tester<T,U>::PrintTestResult(const std::string &message) {
-  if (print_count_ == kResultsPerLine) {
-    print_count_ = 0;
-    fprintf(stdout, "\n   ");
+  if (verbose_) {  // verbose mode: every result is followed by a newline
+    fprintf(stdout, "%s\n", message.c_str());
+  }
+  else
+  {  // compact mode: results share a line, which is wrapped after kResultsPerLine results
+    if (print_count_ == kResultsPerLine) {
+      print_count_ = 0;
+      fprintf(stdout, "\n   ");
+    }
+    fprintf(stdout, "%s", message.c_str());
+    print_count_++;  // only counted in compact mode; verbose mode leaves the counter untouched
   }
-  fprintf(stdout, "%s", message.c_str());
   std::cout << std::flush;
-  print_count_++;
 }
 
 // Prints details of errors occurred in a given error log
@@ -292,32 +345,7 @@ void Tester<T,U>::PrintErrorLog(const std::vector<ErrorLogEntry> &error_log) {
     else {
       fprintf(stdout, "   Status code %d (expected %d): ", entry.status_found, entry.status_expect);
     }
-    for (auto &o: options_) {
-      if (o == kArgM)        { fprintf(stdout, "%s=%zu ", kArgM, entry.args.m); }
-      if (o == kArgN)        { fprintf(stdout, "%s=%zu ", kArgN, entry.args.n); }
-      if (o == kArgK)        { fprintf(stdout, "%s=%zu ", kArgK, entry.args.k); }
-      if (o == kArgKU)       { fprintf(stdout, "%s=%zu ", kArgKU, entry.args.ku); }
-      if (o == kArgKL)       { fprintf(stdout, "%s=%zu ", kArgKL, entry.args.kl); }
-      if (o == kArgLayout)   { fprintf(stdout, "%s=%d ", kArgLayout, entry.args.layout);}
-      if (o == kArgATransp)  { fprintf(stdout, "%s=%d ", kArgATransp, entry.args.a_transpose);}
-      if (o == kArgBTransp)  { fprintf(stdout, "%s=%d ", kArgBTransp, entry.args.b_transpose);}
-      if (o == kArgSide)     { fprintf(stdout, "%s=%d ", kArgSide, entry.args.side);}
-      if (o == kArgTriangle) { fprintf(stdout, "%s=%d ", kArgTriangle, entry.args.triangle);}
-      if (o == kArgDiagonal) { fprintf(stdout, "%s=%d ", kArgDiagonal, entry.args.diagonal);}
-      if (o == kArgXInc)     { fprintf(stdout, "%s=%zu ", kArgXInc, entry.args.x_inc);}
-      if (o == kArgYInc)     { fprintf(stdout, "%s=%zu ", kArgYInc, entry.args.y_inc);}
-      if (o == kArgXOffset)  { fprintf(stdout, "%s=%zu ", kArgXOffset, entry.args.x_offset);}
-      if (o == kArgYOffset)  { fprintf(stdout, "%s=%zu ", kArgYOffset, entry.args.y_offset);}
-      if (o == kArgALeadDim) { fprintf(stdout, "%s=%zu ", kArgALeadDim, entry.args.a_ld);}
-      if (o == kArgBLeadDim) { fprintf(stdout, "%s=%zu ", kArgBLeadDim, entry.args.b_ld);}
-      if (o == kArgCLeadDim) { fprintf(stdout, "%s=%zu ", kArgCLeadDim, entry.args.c_ld);}
-      if (o == kArgAOffset)  { fprintf(stdout, "%s=%zu ", kArgAOffset, entry.args.a_offset);}
-      if (o == kArgBOffset)  { fprintf(stdout, "%s=%zu ", kArgBOffset, entry.args.b_offset);}
-      if (o == kArgCOffset)  { fprintf(stdout, "%s=%zu ", kArgCOffset, entry.args.c_offset);}
-      if (o == kArgAPOffset) { fprintf(stdout, "%s=%zu ", kArgAPOffset, entry.args.ap_offset);}
-      if (o == kArgDotOffset){ fprintf(stdout, "%s=%zu ", kArgDotOffset, entry.args.dot_offset);}
-    }
-    fprintf(stdout, "\n");
+    fprintf(stdout, "%s\n", GetOptionsString(entry.args).c_str());
   }
 }
 
@@ -351,11 +379,11 @@ bool TestSimilarity(const T val1, const T val2) {
   }
 }
 
-// Compiles the default case for non-complex data-types
+// Compiles the default case for standard data-types
 template bool TestSimilarity<float>(const float, const float);
 template bool TestSimilarity<double>(const double, const double);
 
-// Specialisations for complex data-types
+// Specialisations for non-standard data-types
 template <>
 bool TestSimilarity(const float2 val1, const float2 val2) {
   auto real = TestSimilarity(val1.real(), val2.real());
@@ -368,6 +396,10 @@ bool TestSimilarity(const double2 val1, const double2 val2) {
   auto imag = TestSimilarity(val1.imag(), val2.imag());
   return (real && imag);
 }
+template <>
+bool TestSimilarity(const half val1, const half val2) {
+  return TestSimilarity(HalfToFloat(val1), HalfToFloat(val2));  // compare the converted values
+}
 
 // =================================================================================================
 
@@ -389,10 +421,15 @@ template <> const std::vector<double2> GetExampleScalars(const bool full_test) {
   if (full_test) { return {{0.0, 0.0}, {1.0, 1.3}, {2.42, 3.14}}; }
   else { return {{2.42, 3.14}}; }
 }
+template <> const std::vector<half> GetExampleScalars(const bool full_test) {
+  if (full_test) { return {FloatToHalf(0.0f), FloatToHalf(1.0f), FloatToHalf(3.14f)}; }
+  else { return {FloatToHalf(3.14f)}; }  // reduced test: a single scalar instead of three
+}
 
 // =================================================================================================
 
 // Compiles the templated class
+template class Tester<half, half>;
 template class Tester<float, float>;
 template class Tester<double, double>;
 template class Tester<float2, float2>;
diff --git a/test/correctness/tester.h b/test/correctness/tester.hpp
index 46d88caf..422da9ed 100644
--- a/test/correctness/tester.h
+++ b/test/correctness/tester.hpp
@@ -28,7 +28,7 @@
 #endif
 #include "clblast.h"
 
-#include "internal/utilities.h"
+#include "utilities.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -58,7 +58,7 @@ class Tester {
   const std::string kErrorStatus{kPrintError + "/" + kPrintEnd};
   const std::string kSkippedCompilation{kPrintWarning + "\\" + kPrintEnd};
   const std::string kUnsupportedPrecision{kPrintWarning + "o" + kPrintEnd};
-  const std::string kUnsupportedReference{kPrintWarning + "." + kPrintEnd};
+  const std::string kUnsupportedReference{kPrintWarning + "-" + kPrintEnd};
 
   // This structure combines the above log-entry with a status code an error percentage
   struct ErrorLogEntry {
@@ -83,6 +83,9 @@ class Tester {
   void TestErrorCodes(const StatusCode clblas_status, const StatusCode clblast_status,
                       const Arguments<U> &args);
 
+  // Returns the total number of failed tests (tests_failed_), as accumulated by TestEnd()
+  size_t NumFailedTests() const { return tests_failed_; }
+
  protected:
 
   // The help-message
@@ -103,6 +106,10 @@ class Tester {
   // Retrieves the offset values to test with
   const std::vector<size_t> GetOffsets() const;
 
+  // Retrieves the list of options as a string
+  std::string GetOptionsString(const Arguments<U> &args); // for regular tests
+  std::string GetSizesString(const Arguments<U> &args); // for invalid buffer sizes
+
   // Testing against reference implementations
   int compare_cblas_;
   int compare_clblas_;
diff --git a/test/performance/client.cc b/test/performance/client.cpp
index 9aaf1e4e..d0068f8b 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cpp
@@ -11,7 +11,7 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
+#include "test/performance/client.hpp"
 
 #include <string>
 #include <vector>
@@ -42,8 +42,10 @@ Client<T,U>::Client(const Routine run_routine,
 // applicable, but are searched for anyway to be able to create one common argument parser. All
 // arguments have a default value in case they are not found.
 template <typename T, typename U>
-Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
-                                         const GetMetric default_b_ld, const GetMetric default_c_ld) {
+Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t level,
+                                         const GetMetric default_a_ld,
+                                         const GetMetric default_b_ld,
+                                         const GetMetric default_c_ld) {
   auto args = Arguments<U>{};
   auto help = std::string{"\n* Options given/available:\n"};
 
@@ -116,6 +118,28 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
   // which is thus always displayed (unless silence is specified).
   if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); }
 
+  // Comparison against a non-BLAS routine is not supported
+  if (level == 4) { // level-4 == level-X
+    if (args.compare_clblas != 0 || args.compare_cblas != 0) {
+      if (!args.silent) {
+        fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for this non-BLAS routine\n\n");
+      }
+    }
+    args.compare_clblas = 0;
+    args.compare_cblas = 0;
+  }
+
+  // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision
+  if (args.precision == Precision::kHalf) {
+    if (args.compare_clblas != 0 || args.compare_cblas != 0) {
+      if (!args.silent) {
+        fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for half-precision\n\n");
+      }
+    }
+    args.compare_clblas = 0;
+    args.compare_cblas = 0;
+  }
+
   // Returns the arguments
   return args;
 }
@@ -339,6 +363,7 @@ void Client<T,U>::PrintTableRow(const Arguments<U>& args,
 // =================================================================================================
 
 // Compiles the templated class
+template class Client<half,half>;
 template class Client<float,float>;
 template class Client<double,double>;
 template class Client<float2,float2>;
diff --git a/test/performance/client.h b/test/performance/client.hpp
index 8d0597d7..5ff2aec7 100644
--- a/test/performance/client.h
+++ b/test/performance/client.hpp
@@ -31,7 +31,7 @@
 #endif
 #include "clblast.h"
 
-#include "internal/utilities.h"
+#include "utilities.hpp"
 
 namespace clblast {
 // =================================================================================================
@@ -53,8 +53,10 @@ class Client {
 
   // Parses all command-line arguments, filling in the arguments structure. If no command-line
   // argument is given for a particular argument, it is filled in with a default value.
-  Arguments<U> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
-                              const GetMetric default_b_ld, const GetMetric default_c_ld);
+  Arguments<U> ParseArguments(int argc, char *argv[], const size_t level,
+                              const GetMetric default_a_ld,
+                              const GetMetric default_b_ld,
+                              const GetMetric default_c_ld);
 
   // The main client function, setting-up arguments, matrices, OpenCL buffers, etc. After set-up, it
   // calls the client routines.
@@ -97,14 +99,14 @@ void RunClient(int argc, char *argv[]) {
 
   // Sets the reference to test against
   #ifdef CLBLAST_REF_CLBLAS
-    const auto reference1 = C::RunReference1; // clBLAS when available
+    auto reference1 = C::RunReference1; // clBLAS when available
   #else
-    const auto reference1 = ReferenceNotAvailable<T,U>;
+    auto reference1 = ReferenceNotAvailable<T,U>;
   #endif
   #ifdef CLBLAST_REF_CBLAS
-    const auto reference2 = C::RunReference2; // CBLAS when available
+    auto reference2 = C::RunReference2; // CBLAS when available
   #else
-    const auto reference2 = ReferenceNotAvailable<T,U>;
+    auto reference2 = ReferenceNotAvailable<T,U>;
   #endif
 
   // Creates a new client
@@ -112,7 +114,8 @@ void RunClient(int argc, char *argv[]) {
                             C::GetFlops, C::GetBytes);
 
   // Simple command line argument parser with defaults
-  auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
+  auto args = client.ParseArguments(argc, argv, C::BLASLevel(),
+                                    C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
   if (args.print_help) { return; }
 
   // Runs the client
diff --git a/test/performance/routines/level1/xamax.cc b/test/performance/routines/level1/xamax.cpp
index 85caa483..450678e0 100644
--- a/test/performance/routines/level1/xamax.cc
+++ b/test/performance/routines/level1/xamax.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xamax.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xamax.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXamax<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXamax<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level1/xasum.cc b/test/performance/routines/level1/xasum.cpp
index 2680966e..c21102f5 100644
--- a/test/performance/routines/level1/xasum.cc
+++ b/test/performance/routines/level1/xasum.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xasum.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xasum.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXasum<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXasum<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level1/xaxpy.cc b/test/performance/routines/level1/xaxpy.cpp
index b423bc3a..e1c4935e 100644
--- a/test/performance/routines/level1/xaxpy.cc
+++ b/test/performance/routines/level1/xaxpy.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xaxpy.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xaxpy.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXaxpy<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXaxpy<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level1/xcopy.cc b/test/performance/routines/level1/xcopy.cpp
index c04c6c1c..ea3531a0 100644
--- a/test/performance/routines/level1/xcopy.cc
+++ b/test/performance/routines/level1/xcopy.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xcopy.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xcopy.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXcopy<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXcopy<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level1/xdot.cc b/test/performance/routines/level1/xdot.cpp
index f4616464..09fe9258 100644
--- a/test/performance/routines/level1/xdot.cc
+++ b/test/performance/routines/level1/xdot.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xdot.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xdot.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXdot<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXdot<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level1/xdotc.cc b/test/performance/routines/level1/xdotc.cpp
index 5f36b80e..6e716ebb 100644
--- a/test/performance/routines/level1/xdotc.cc
+++ b/test/performance/routines/level1/xdotc.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xdotc.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xdotc.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level1/xdotu.cc b/test/performance/routines/level1/xdotu.cpp
index f19f751b..d011d558 100644
--- a/test/performance/routines/level1/xdotu.cc
+++ b/test/performance/routines/level1/xdotu.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xdotu.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xdotu.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level1/xnrm2.cc b/test/performance/routines/level1/xnrm2.cpp
index db6ec9ad..1d6e177d 100644
--- a/test/performance/routines/level1/xnrm2.cc
+++ b/test/performance/routines/level1/xnrm2.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xnrm2.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xnrm2.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXnrm2<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXnrm2<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level1/xrot.cc b/test/performance/routines/level1/xrot.cpp
index 3ff59ace..4b543f1b 100644
--- a/test/performance/routines/level1/xrot.cc
+++ b/test/performance/routines/level1/xrot.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xrot.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xrot.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level1/xrotg.cc b/test/performance/routines/level1/xrotg.cpp
index 0320c314..e52704b0 100644
--- a/test/performance/routines/level1/xrotg.cc
+++ b/test/performance/routines/level1/xrotg.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xrotg.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xrotg.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level1/xrotm.cc b/test/performance/routines/level1/xrotm.cpp
index 7af94d0f..83ee1d9d 100644
--- a/test/performance/routines/level1/xrotm.cc
+++ b/test/performance/routines/level1/xrotm.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xrotm.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xrotm.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level1/xrotmg.cc b/test/performance/routines/level1/xrotmg.cpp
index a326347b..ee1539d9 100644
--- a/test/performance/routines/level1/xrotmg.cc
+++ b/test/performance/routines/level1/xrotmg.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xrotmg.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xrotmg.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level1/xscal.cc b/test/performance/routines/level1/xscal.cpp
index bd38f43e..adb83a90 100644
--- a/test/performance/routines/level1/xscal.cc
+++ b/test/performance/routines/level1/xscal.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xscal.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xscal.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXscal<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXscal<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level1/xswap.cc b/test/performance/routines/level1/xswap.cpp
index 112641d3..7f591d19 100644
--- a/test/performance/routines/level1/xswap.cc
+++ b/test/performance/routines/level1/xswap.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level1/xswap.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level1/xswap.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXswap<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXswap<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xgbmv.cc b/test/performance/routines/level2/xgbmv.cpp
index b050184d..6aa72ded 100644
--- a/test/performance/routines/level2/xgbmv.cc
+++ b/test/performance/routines/level2/xgbmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xgbmv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xgbmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXgbmv<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXgbmv<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xgemv.cc b/test/performance/routines/level2/xgemv.cpp
index 51ab9a10..fdcef95d 100644
--- a/test/performance/routines/level2/xgemv.cc
+++ b/test/performance/routines/level2/xgemv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xgemv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xgemv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXgemv<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXgemv<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xger.cc b/test/performance/routines/level2/xger.cpp
index 2d956346..c4f3699d 100644
--- a/test/performance/routines/level2/xger.cc
+++ b/test/performance/routines/level2/xger.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xger.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xger.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXger<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXger<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xgerc.cc b/test/performance/routines/level2/xgerc.cpp
index acd0fab7..f855dc11 100644
--- a/test/performance/routines/level2/xgerc.cc
+++ b/test/performance/routines/level2/xgerc.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xgerc.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xgerc.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xgeru.cc b/test/performance/routines/level2/xgeru.cpp
index a5973777..2bf885e3 100644
--- a/test/performance/routines/level2/xgeru.cc
+++ b/test/performance/routines/level2/xgeru.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xgeru.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xgeru.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xhbmv.cc b/test/performance/routines/level2/xhbmv.cpp
index 28b71045..b7f3b9ad 100644
--- a/test/performance/routines/level2/xhbmv.cc
+++ b/test/performance/routines/level2/xhbmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xhbmv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xhbmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xhemv.cc b/test/performance/routines/level2/xhemv.cpp
index 622854a7..e1168083 100644
--- a/test/performance/routines/level2/xhemv.cc
+++ b/test/performance/routines/level2/xhemv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xhemv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xhemv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xher.cc b/test/performance/routines/level2/xher.cpp
index 613d7766..0d1bc1dd 100644
--- a/test/performance/routines/level2/xher.cc
+++ b/test/performance/routines/level2/xher.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xher.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xher.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xher2.cc b/test/performance/routines/level2/xher2.cpp
index c335d3be..3d98c838 100644
--- a/test/performance/routines/level2/xher2.cc
+++ b/test/performance/routines/level2/xher2.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xher2.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xher2.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xhpmv.cc b/test/performance/routines/level2/xhpmv.cpp
index 1e726569..c3bc3d9c 100644
--- a/test/performance/routines/level2/xhpmv.cc
+++ b/test/performance/routines/level2/xhpmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xhpmv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xhpmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xhpr.cc b/test/performance/routines/level2/xhpr.cpp
index 000b69af..afc65b25 100644
--- a/test/performance/routines/level2/xhpr.cc
+++ b/test/performance/routines/level2/xhpr.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xhpr.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xhpr.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xhpr2.cc b/test/performance/routines/level2/xhpr2.cpp
index 19bafc46..c543dc90 100644
--- a/test/performance/routines/level2/xhpr2.cc
+++ b/test/performance/routines/level2/xhpr2.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xhpr2.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xhpr2.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xsbmv.cc b/test/performance/routines/level2/xsbmv.cpp
index eabab3b7..32899a74 100644
--- a/test/performance/routines/level2/xsbmv.cc
+++ b/test/performance/routines/level2/xsbmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xsbmv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xsbmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXsbmv<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXsbmv<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xspmv.cc b/test/performance/routines/level2/xspmv.cpp
index 2a9ef925..0b0d2409 100644
--- a/test/performance/routines/level2/xspmv.cc
+++ b/test/performance/routines/level2/xspmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xspmv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xspmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXspmv<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXspmv<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xspr.cc b/test/performance/routines/level2/xspr.cpp
index 84331d74..9c1c80a0 100644
--- a/test/performance/routines/level2/xspr.cc
+++ b/test/performance/routines/level2/xspr.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xspr.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xspr.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXspr<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXspr<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xspr2.cc b/test/performance/routines/level2/xspr2.cpp
index c42009a1..117e9c2f 100644
--- a/test/performance/routines/level2/xspr2.cc
+++ b/test/performance/routines/level2/xspr2.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xspr2.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xspr2.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXspr2<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXspr2<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xsymv.cc b/test/performance/routines/level2/xsymv.cpp
index 3f72fe77..60db1ae9 100644
--- a/test/performance/routines/level2/xsymv.cc
+++ b/test/performance/routines/level2/xsymv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xsymv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xsymv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXsymv<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXsymv<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xsyr.cc b/test/performance/routines/level2/xsyr.cpp
index 6b31d3a9..d9ecd38a 100644
--- a/test/performance/routines/level2/xsyr.cc
+++ b/test/performance/routines/level2/xsyr.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xsyr.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xsyr.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXsyr<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXsyr<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xsyr2.cc b/test/performance/routines/level2/xsyr2.cpp
index 0ad59d2d..24e0a517 100644
--- a/test/performance/routines/level2/xsyr2.cc
+++ b/test/performance/routines/level2/xsyr2.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xsyr2.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xsyr2.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXsyr2<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXsyr2<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xtbmv.cc b/test/performance/routines/level2/xtbmv.cpp
index a3297f34..ed9d26a8 100644
--- a/test/performance/routines/level2/xtbmv.cc
+++ b/test/performance/routines/level2/xtbmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xtbmv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xtbmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXtbmv<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXtbmv<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xtbsv.cc b/test/performance/routines/level2/xtbsv.cpp
index 4dcd9a06..f0b80330 100644
--- a/test/performance/routines/level2/xtbsv.cc
+++ b/test/performance/routines/level2/xtbsv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xtbsv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xtbsv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xtpmv.cc b/test/performance/routines/level2/xtpmv.cpp
index 72477f2d..c5801205 100644
--- a/test/performance/routines/level2/xtpmv.cc
+++ b/test/performance/routines/level2/xtpmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xtpmv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xtpmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXtpmv<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXtpmv<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xtpsv.cc b/test/performance/routines/level2/xtpsv.cpp
index a3e3f7f1..db956c9d 100644
--- a/test/performance/routines/level2/xtpsv.cc
+++ b/test/performance/routines/level2/xtpsv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xtpsv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xtpsv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level2/xtrmv.cc b/test/performance/routines/level2/xtrmv.cpp
index 894a7952..629c773c 100644
--- a/test/performance/routines/level2/xtrmv.cc
+++ b/test/performance/routines/level2/xtrmv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xtrmv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xtrmv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXtrmv<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXtrmv<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level2/xtrsv.cc b/test/performance/routines/level2/xtrsv.cpp
index e8c65b0f..d6c2968c 100644
--- a/test/performance/routines/level2/xtrsv.cc
+++ b/test/performance/routines/level2/xtrsv.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level2/xtrsv.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level2/xtrsv.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level3/xgemm.cc b/test/performance/routines/level3/xgemm.cpp
index 91897ee1..3f68096e 100644
--- a/test/performance/routines/level3/xgemm.cc
+++ b/test/performance/routines/level3/xgemm.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xgemm.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xgemm.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXgemm<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXgemm<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level3/xhemm.cc b/test/performance/routines/level3/xhemm.cpp
index 87650b9e..ff6d0f71 100644
--- a/test/performance/routines/level3/xhemm.cc
+++ b/test/performance/routines/level3/xhemm.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xhemm.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xhemm.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level3/xher2k.cc b/test/performance/routines/level3/xher2k.cpp
index 06894816..9636959e 100644
--- a/test/performance/routines/level3/xher2k.cc
+++ b/test/performance/routines/level3/xher2k.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xher2k.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xher2k.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level3/xherk.cc b/test/performance/routines/level3/xherk.cpp
index d6f38fb2..d51400f0 100644
--- a/test/performance/routines/level3/xherk.cc
+++ b/test/performance/routines/level3/xherk.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xherk.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xherk.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
diff --git a/test/performance/routines/level3/xsymm.cc b/test/performance/routines/level3/xsymm.cpp
index e0feadd1..38c3dc9b 100644
--- a/test/performance/routines/level3/xsymm.cc
+++ b/test/performance/routines/level3/xsymm.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xsymm.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xsymm.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXsymm<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXsymm<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level3/xsyr2k.cc b/test/performance/routines/level3/xsyr2k.cpp
index 4a82ddc4..5360e297 100644
--- a/test/performance/routines/level3/xsyr2k.cc
+++ b/test/performance/routines/level3/xsyr2k.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xsyr2k.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xsyr2k.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXsyr2k<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXsyr2k<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level3/xsyrk.cc b/test/performance/routines/level3/xsyrk.cpp
index 70f61322..30612f99 100644
--- a/test/performance/routines/level3/xsyrk.cc
+++ b/test/performance/routines/level3/xsyrk.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xsyrk.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xsyrk.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXsyrk<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXsyrk<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level3/xtrmm.cc b/test/performance/routines/level3/xtrmm.cpp
index 6f6041e4..264a34e7 100644
--- a/test/performance/routines/level3/xtrmm.cc
+++ b/test/performance/routines/level3/xtrmm.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xtrmm.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xtrmm.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXtrmm<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXtrmm<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/level3/xtrsm.cc b/test/performance/routines/level3/xtrsm.cpp
index 76ef255a..80c46d91 100644
--- a/test/performance/routines/level3/xtrsm.cc
+++ b/test/performance/routines/level3/xtrsm.cpp
@@ -9,8 +9,8 @@
 //
 // =================================================================================================
 
-#include "performance/client.h"
-#include "routines/level3/xtrsm.h"
+#include "test/performance/client.hpp"
+#include "test/routines/level3/xtrsm.hpp"
 
 // Shortcuts to the clblast namespace
 using float2 = clblast::float2;
@@ -19,7 +19,8 @@ using double2 = clblast::double2;
 // Main function (not within the clblast namespace)
 int main(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
-    case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXtrsm<half>, half, half>(argc, argv); break;
     case clblast::Precision::kSingle:
       clblast::RunClient<clblast::TestXtrsm<float>, float, float>(argc, argv); break;
     case clblast::Precision::kDouble:
diff --git a/test/performance/routines/levelx/xomatcopy.cpp b/test/performance/routines/levelx/xomatcopy.cpp
new file mode 100644
index 00000000..0bd5773e
--- /dev/null
+++ b/test/performance/routines/levelx/xomatcopy.cpp
@@ -0,0 +1,36 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "test/performance/client.hpp"
+#include "test/routines/levelx/xomatcopy.hpp"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace): dispatches on the requested precision
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {  // no default case
+    case clblast::Precision::kHalf:  // test and client instantiated with half
+      clblast::RunClient<clblast::TestXomatcopy<half>, half, half>(argc, argv); break;
+    case clblast::Precision::kSingle:  // test and client instantiated with float
+      clblast::RunClient<clblast::TestXomatcopy<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:  // test and client instantiated with double
+      clblast::RunClient<clblast::TestXomatcopy<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:  // float2 is the alias of clblast::float2
+      clblast::RunClient<clblast::TestXomatcopy<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:  // double2 is the alias of clblast::double2
+      clblast::RunClient<clblast::TestXomatcopy<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;  // also reached when none of the case labels above matches
+}
+
+// =================================================================================================
diff --git a/test/routines/level1/xamax.h b/test/routines/level1/xamax.hpp
index 7b404dc3..4423845e 100644
--- a/test/routines/level1/xamax.h
+++ b/test/routines/level1/xamax.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -86,8 +86,8 @@ class TestXamax {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXamax<T>(args.n,
-                                   buffers.scalar(), args.imax_offset,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
+                                   buffers.scalar, args.imax_offset,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xasum.h b/test/routines/level1/xasum.hpp
index 6eae3c83..b1f02dcd 100644
--- a/test/routines/level1/xasum.h
+++ b/test/routines/level1/xasum.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -86,8 +86,8 @@ class TestXasum {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXasum<T>(args.n,
-                                   buffers.scalar(), args.asum_offset,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
+                                   buffers.scalar, args.asum_offset,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.hpp
index 8f72f570..c276a42e 100644
--- a/test/routines/level1/xaxpy.h
+++ b/test/routines/level1/xaxpy.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -87,8 +87,8 @@ class TestXaxpy {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXaxpy(args.n, args.alpha,
-                                buffers.x_vec(), args.x_offset, args.x_inc,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.x_vec, args.x_offset, args.x_inc,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.hpp
index 0527ca6a..a96bb9ae 100644
--- a/test/routines/level1/xcopy.h
+++ b/test/routines/level1/xcopy.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -86,8 +86,8 @@ class TestXcopy {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXcopy<T>(args.n,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
-                                   buffers.y_vec(), args.y_offset, args.y_inc,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers.y_vec, args.y_offset, args.y_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.hpp
index d1c34c0f..f6cf2809 100644
--- a/test/routines/level1/xdot.h
+++ b/test/routines/level1/xdot.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -91,9 +91,9 @@ class TestXdot {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXdot<T>(args.n,
-                                  buffers.scalar(), args.dot_offset,
-                                  buffers.x_vec(), args.x_offset, args.x_inc,
-                                  buffers.y_vec(), args.y_offset, args.y_inc,
+                                  buffers.scalar, args.dot_offset,
+                                  buffers.x_vec, args.x_offset, args.x_inc,
+                                  buffers.y_vec, args.y_offset, args.y_inc,
                                   1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.hpp
index a2742cb0..2b00d04b 100644
--- a/test/routines/level1/xdotc.h
+++ b/test/routines/level1/xdotc.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -91,9 +91,9 @@ class TestXdotc {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXdotc<T>(args.n,
-                                   buffers.scalar(), args.dot_offset,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
-                                   buffers.y_vec(), args.y_offset, args.y_inc,
+                                   buffers.scalar, args.dot_offset,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers.y_vec, args.y_offset, args.y_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.hpp
index 06ce979e..31a867e0 100644
--- a/test/routines/level1/xdotu.h
+++ b/test/routines/level1/xdotu.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -91,9 +91,9 @@ class TestXdotu {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXdotu<T>(args.n,
-                                   buffers.scalar(), args.dot_offset,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
-                                   buffers.y_vec(), args.y_offset, args.y_inc,
+                                   buffers.scalar, args.dot_offset,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers.y_vec, args.y_offset, args.y_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.hpp
index d8a0de4e..62d649e3 100644
--- a/test/routines/level1/xnrm2.h
+++ b/test/routines/level1/xnrm2.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -86,8 +86,8 @@ class TestXnrm2 {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXnrm2<T>(args.n,
-                                   buffers.scalar(), args.nrm2_offset,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
+                                   buffers.scalar, args.nrm2_offset,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.hpp
index 35855dbd..79926890 100644
--- a/test/routines/level1/xscal.h
+++ b/test/routines/level1/xscal.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -82,7 +82,7 @@ class TestXscal {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXscal(args.n, args.alpha,
-                                buffers.x_vec(), args.x_offset, args.x_inc,
+                                buffers.x_vec, args.x_offset, args.x_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.hpp
index ae69d3be..8f7e4cfe 100644
--- a/test/routines/level1/xswap.h
+++ b/test/routines/level1/xswap.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -86,8 +86,8 @@ class TestXswap {
       auto queue_plain = queue();
       auto event = cl_event{};
       auto status = clblasXswap<T>(args.n,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
-                                   buffers.y_vec(), args.y_offset, args.y_inc,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
+                                   buffers.y_vec, args.y_offset, args.y_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.hpp
index c88cdf2a..5a907077 100644
--- a/test/routines/level2/xgbmv.h
+++ b/test/routines/level2/xgbmv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -102,9 +102,9 @@ class TestXgbmv {
       auto status = clblasXgbmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.a_transpose),
                                 args.m, args.n, args.kl, args.ku, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.hpp
index cf63d55f..1499b2d2 100644
--- a/test/routines/level2/xgemv.h
+++ b/test/routines/level2/xgemv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -102,9 +102,9 @@ class TestXgemv {
       auto status = clblasXgemv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.a_transpose),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.hpp
index ae142e2e..5cbed505 100644
--- a/test/routines/level2/xger.h
+++ b/test/routines/level2/xger.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -97,9 +97,9 @@ class TestXger {
       auto event = cl_event{};
       auto status = clblasXger(convertToCLBLAS(args.layout),
                                args.m, args.n, args.alpha,
-                               buffers.x_vec(), args.x_offset, args.x_inc,
-                               buffers.y_vec(), args.y_offset, args.y_inc,
-                               buffers.a_mat(), args.a_offset, args.a_ld,
+                               buffers.x_vec, args.x_offset, args.x_inc,
+                               buffers.y_vec, args.y_offset, args.y_inc,
+                               buffers.a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.hpp
index b236aef6..d50092cb 100644
--- a/test/routines/level2/xgerc.h
+++ b/test/routines/level2/xgerc.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -97,9 +97,9 @@ class TestXgerc {
       auto event = cl_event{};
       auto status = clblasXgerc(convertToCLBLAS(args.layout),
                                 args.m, args.n, args.alpha,
-                                buffers.x_vec(), args.x_offset, args.x_inc,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc,
+                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.hpp
index 3d3fa439..9c823b73 100644
--- a/test/routines/level2/xgeru.h
+++ b/test/routines/level2/xgeru.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -97,9 +97,9 @@ class TestXgeru {
       auto event = cl_event{};
       auto status = clblasXgeru(convertToCLBLAS(args.layout),
                                 args.m, args.n, args.alpha,
-                                buffers.x_vec(), args.x_offset, args.x_inc,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc,
+                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.hpp
index 4098639a..01cb3f51 100644
--- a/test/routines/level2/xhbmv.h
+++ b/test/routines/level2/xhbmv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXhbmv {
       auto status = clblasXhbmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.kl, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.hpp
index 5652872d..dadd3975 100644
--- a/test/routines/level2/xhemv.h
+++ b/test/routines/level2/xhemv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXhemv {
       auto status = clblasXhemv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.hpp
index 3bbf0887..b21c0a9b 100644
--- a/test/routines/level2/xher.h
+++ b/test/routines/level2/xher.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -91,8 +91,8 @@ class TestXher {
       auto status = clblasXher(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
-                               buffers.x_vec(), args.x_offset, args.x_inc,
-                               buffers.a_mat(), args.a_offset, args.a_ld,
+                               buffers.x_vec, args.x_offset, args.x_inc,
+                               buffers.a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.hpp
index dc7fbe73..070f823c 100644
--- a/test/routines/level2/xher2.h
+++ b/test/routines/level2/xher2.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXher2 {
       auto status = clblasXher2(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.x_vec(), args.x_offset, args.x_inc,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc,
+                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.hpp
index df5a90ee..d7f9634e 100644
--- a/test/routines/level2/xhpmv.h
+++ b/test/routines/level2/xhpmv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXhpmv {
       auto status = clblasXhpmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.ap_mat(), args.ap_offset,
-                                buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.ap_mat, args.ap_offset,
+                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.hpp
index 0db11db0..8f44a68d 100644
--- a/test/routines/level2/xhpr.h
+++ b/test/routines/level2/xhpr.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -91,8 +91,8 @@ class TestXhpr {
       auto status = clblasXhpr(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
-                               buffers.x_vec(), args.x_offset, args.x_inc,
-                               buffers.ap_mat(), args.ap_offset,
+                               buffers.x_vec, args.x_offset, args.x_inc,
+                               buffers.ap_mat, args.ap_offset,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.hpp
index e1e5b4c5..666a8dfc 100644
--- a/test/routines/level2/xhpr2.h
+++ b/test/routines/level2/xhpr2.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXhpr2 {
       auto status = clblasXhpr2(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.x_vec(), args.x_offset, args.x_inc,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
-                                buffers.ap_mat(), args.ap_offset,
+                                buffers.x_vec, args.x_offset, args.x_inc,
+                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers.ap_mat, args.ap_offset,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.hpp
index fce88f4c..fd5dd68e 100644
--- a/test/routines/level2/xsbmv.h
+++ b/test/routines/level2/xsbmv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXsbmv {
       auto status = clblasXsbmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.kl, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.hpp
index 2fdba77a..63286248 100644
--- a/test/routines/level2/xspmv.h
+++ b/test/routines/level2/xspmv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXspmv {
       auto status = clblasXspmv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.ap_mat(), args.ap_offset,
-                                buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.ap_mat, args.ap_offset,
+                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.hpp
index dcacc5de..f9dead53 100644
--- a/test/routines/level2/xspr.h
+++ b/test/routines/level2/xspr.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -91,8 +91,8 @@ class TestXspr {
       auto status = clblasXspr(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
-                               buffers.x_vec(), args.x_offset, args.x_inc,
-                               buffers.ap_mat(), args.ap_offset,
+                               buffers.x_vec, args.x_offset, args.x_inc,
+                               buffers.ap_mat, args.ap_offset,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.hpp
index 69fda2fb..a2f22098 100644
--- a/test/routines/level2/xspr2.h
+++ b/test/routines/level2/xspr2.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXspr2 {
       auto status = clblasXspr2(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.x_vec(), args.x_offset, args.x_inc,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
-                                buffers.ap_mat(), args.ap_offset,
+                                buffers.x_vec, args.x_offset, args.x_inc,
+                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers.ap_mat, args.ap_offset,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.hpp
index 16f94d6f..0d3ca632 100644
--- a/test/routines/level2/xsymv.h
+++ b/test/routines/level2/xsymv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXsymv {
       auto status = clblasXsymv(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc, args.beta,
+                                buffers.y_vec, args.y_offset, args.y_inc,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.hpp
index a66dd271..15ad9595 100644
--- a/test/routines/level2/xsyr.h
+++ b/test/routines/level2/xsyr.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -91,8 +91,8 @@ class TestXsyr {
       auto status = clblasXsyr(convertToCLBLAS(args.layout),
                                convertToCLBLAS(args.triangle),
                                args.n, args.alpha,
-                               buffers.x_vec(), args.x_offset, args.x_inc,
-                               buffers.a_mat(), args.a_offset, args.a_ld,
+                               buffers.x_vec, args.x_offset, args.x_inc,
+                               buffers.a_mat, args.a_offset, args.a_ld,
                                1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.hpp
index a36815e5..a9a61a1f 100644
--- a/test/routines/level2/xsyr2.h
+++ b/test/routines/level2/xsyr2.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -96,9 +96,9 @@ class TestXsyr2 {
       auto status = clblasXsyr2(convertToCLBLAS(args.layout),
                                 convertToCLBLAS(args.triangle),
                                 args.n, args.alpha,
-                                buffers.x_vec(), args.x_offset, args.x_inc,
-                                buffers.y_vec(), args.y_offset, args.y_inc,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
+                                buffers.x_vec, args.x_offset, args.x_inc,
+                                buffers.y_vec, args.y_offset, args.y_inc,
+                                buffers.a_mat, args.a_offset, args.a_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.hpp
index 1425b60b..54e7fe18 100644
--- a/test/routines/level2/xtbmv.h
+++ b/test/routines/level2/xtbmv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -92,8 +92,8 @@ class TestXtbmv {
                                    convertToCLBLAS(args.a_transpose),
                                    convertToCLBLAS(args.diagonal),
                                    args.n, args.kl,
-                                   buffers.a_mat(), args.a_offset, args.a_ld,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
+                                   buffers.a_mat, args.a_offset, args.a_ld,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.hpp
index a834b437..9776c4de 100644
--- a/test/routines/level2/xtpmv.h
+++ b/test/routines/level2/xtpmv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -92,8 +92,8 @@ class TestXtpmv {
                                    convertToCLBLAS(args.a_transpose),
                                    convertToCLBLAS(args.diagonal),
                                    args.n,
-                                   buffers.ap_mat(), args.ap_offset,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
+                                   buffers.ap_mat, args.ap_offset,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.hpp
index cd502d5d..18300e50 100644
--- a/test/routines/level2/xtrmv.h
+++ b/test/routines/level2/xtrmv.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -92,8 +92,8 @@ class TestXtrmv {
                                    convertToCLBLAS(args.a_transpose),
                                    convertToCLBLAS(args.diagonal),
                                    args.n,
-                                   buffers.a_mat(), args.a_offset, args.a_ld,
-                                   buffers.x_vec(), args.x_offset, args.x_inc,
+                                   buffers.a_mat, args.a_offset, args.a_ld,
+                                   buffers.x_vec, args.x_offset, args.x_inc,
                                    1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.hpp
index cd5c2acd..5f9bea81 100644
--- a/test/routines/level3/xgemm.h
+++ b/test/routines/level3/xgemm.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -105,9 +105,9 @@ class TestXgemm {
                                 convertToCLBLAS(args.a_transpose),
                                 convertToCLBLAS(args.b_transpose),
                                 args.m, args.n, args.k, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                                buffers.c_mat(), args.c_offset, args.c_ld,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.b_mat, args.b_offset, args.b_ld, args.beta,
+                                buffers.c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.hpp
index edc71024..8c44be25 100644
--- a/test/routines/level3/xhemm.h
+++ b/test/routines/level3/xhemm.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -105,9 +105,9 @@ class TestXhemm {
                                 convertToCLBLAS(args.side),
                                 convertToCLBLAS(args.triangle),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                                buffers.c_mat(), args.c_offset, args.c_ld,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.b_mat, args.b_offset, args.b_ld, args.beta,
+                                buffers.c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.hpp
index a78e1293..fd20bbb5 100644
--- a/test/routines/level3/xher2k.h
+++ b/test/routines/level3/xher2k.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -105,9 +105,9 @@ class TestXher2k {
                                  convertToCLBLAS(args.triangle),
                                  convertToCLBLAS(args.a_transpose),
                                  args.n, args.k, alpha2,
-                                 buffers.a_mat(), args.a_offset, args.a_ld,
-                                 buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                                 buffers.c_mat(), args.c_offset, args.c_ld,
+                                 buffers.a_mat, args.a_offset, args.a_ld,
+                                 buffers.b_mat, args.b_offset, args.b_ld, args.beta,
+                                 buffers.c_mat, args.c_offset, args.c_ld,
                                  1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.hpp
index 245293d6..12990d39 100644
--- a/test/routines/level3/xherk.h
+++ b/test/routines/level3/xherk.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -95,8 +95,8 @@ class TestXherk {
                                 convertToCLBLAS(args.triangle),
                                 convertToCLBLAS(args.a_transpose),
                                 args.n, args.k, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
-                                buffers.c_mat(), args.c_offset, args.c_ld,
+                                buffers.a_mat, args.a_offset, args.a_ld, args.beta,
+                                buffers.c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.hpp
index e638b735..f8e90927 100644
--- a/test/routines/level3/xsymm.h
+++ b/test/routines/level3/xsymm.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -105,9 +105,9 @@ class TestXsymm {
                                 convertToCLBLAS(args.side),
                                 convertToCLBLAS(args.triangle),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                                buffers.c_mat(), args.c_offset, args.c_ld,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.b_mat, args.b_offset, args.b_ld, args.beta,
+                                buffers.c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.hpp
index abac20f4..4e4ba0b7 100644
--- a/test/routines/level3/xsyr2k.h
+++ b/test/routines/level3/xsyr2k.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -103,9 +103,9 @@ class TestXsyr2k {
                                  convertToCLBLAS(args.triangle),
                                  convertToCLBLAS(args.a_transpose),
                                  args.n, args.k, args.alpha,
-                                 buffers.a_mat(), args.a_offset, args.a_ld,
-                                 buffers.b_mat(), args.b_offset, args.b_ld, args.beta,
-                                 buffers.c_mat(), args.c_offset, args.c_ld,
+                                 buffers.a_mat, args.a_offset, args.a_ld,
+                                 buffers.b_mat, args.b_offset, args.b_ld, args.beta,
+                                 buffers.c_mat, args.c_offset, args.c_ld,
                                  1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.hpp
index 8a5fcb5f..f5509c88 100644
--- a/test/routines/level3/xsyrk.h
+++ b/test/routines/level3/xsyrk.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -95,8 +95,8 @@ class TestXsyrk {
                                 convertToCLBLAS(args.triangle),
                                 convertToCLBLAS(args.a_transpose),
                                 args.n, args.k, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld, args.beta,
-                                buffers.c_mat(), args.c_offset, args.c_ld,
+                                buffers.a_mat, args.a_offset, args.a_ld, args.beta,
+                                buffers.c_mat, args.c_offset, args.c_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.hpp
index 7c9c21bc..45e17e45 100644
--- a/test/routines/level3/xtrmm.h
+++ b/test/routines/level3/xtrmm.hpp
@@ -20,10 +20,10 @@
 #include <string>
 
 #ifdef CLBLAST_REF_CLBLAS
-  #include "wrapper_clblas.h"
+  #include "test/wrapper_clblas.hpp"
 #endif
 #ifdef CLBLAST_REF_CBLAS
-  #include "wrapper_cblas.h"
+  #include "test/wrapper_cblas.hpp"
 #endif
 
 namespace clblast {
@@ -97,8 +97,8 @@ class TestXtrmm {
                                 convertToCLBLAS(args.a_transpose),
                                 convertToCLBLAS(args.diagonal),
                                 args.m, args.n, args.alpha,
-                                buffers.a_mat(), args.a_offset, args.a_ld,
-                                buffers.b_mat(), args.b_offset, args.b_ld,
+                                buffers.a_mat, args.a_offset, args.a_ld,
+                                buffers.b_mat, args.b_offset, args.b_ld,
                                 1, &queue_plain, 0, nullptr, &event);
       clWaitForEvents(1, &event);
       return static_cast<StatusCode>(status);
diff --git a/test/routines/levelx/xomatcopy.hpp b/test/routines/levelx/xomatcopy.hpp
new file mode 100644
index 00000000..4637c07e
--- /dev/null
+++ b/test/routines/levelx/xomatcopy.hpp
@@ -0,0 +1,159 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xomatcopy routine. Examples of
+// such 'descriptions' are how to calculate the size of a buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XOMATCOPY_H_
+#define CLBLAST_TEST_ROUTINES_XOMATCOPY_H_
+
+#include <vector>
+#include <string>
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXomatcopy {
+ public:
+
+  // The BLAS level: 4 for the extra routines
+  static size_t BLASLevel() { return 4; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgATransp,
+            kArgALeadDim, kArgBLeadDim,
+            kArgAOffset, kArgBOffset,
+            kArgAlpha};
+  }
+
+  // Describes how to obtain the sizes of the buffers: (second dimension * leading dim) + offset
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto a_rotated = (args.layout == Layout::kRowMajor);
+    const auto a_two = (a_rotated) ? args.m : args.n;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                           (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    const auto b_two = (b_rotated) ? args.m : args.n; // same mapping as in GetResultIndex
+    return b_two * args.b_ld + args.b_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers (stored in the arguments structure itself)
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which transpose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine. The event is only waited on and released on success
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Omatcopy<T>(args.layout, args.a_transpose,
+                              args.m, args.n, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld,
+                              &queue_plain, &event);
+    if (status == StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); }
+    return status;
+  }
+
+  // Describes how to run a naive version of the routine (for correctness/performance comparison).
+  // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
+  static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    return RunReference2(args, buffers, queue);
+  }
+  // The naive implementation itself: computes B = alpha * A (optionally transposed) on the host
+  static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+
+    // Data transfer from OpenCL to std::vector
+    std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+    std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+    buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+    buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+
+    // Checking for invalid arguments (the '_base' values are the highest accessed index plus one)
+    const auto a_rotated = (args.layout == Layout::kRowMajor);
+    const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                           (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    const auto a_base = (a_rotated) ? args.a_ld*(args.m-1) + args.n : args.a_ld*(args.n-1) + args.m;
+    const auto b_base = (b_rotated) ? args.b_ld*(args.m-1) + args.n : args.b_ld*(args.n-1) + args.m;
+    if ((args.m == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; }
+    if ((args.a_ld < args.m && !a_rotated) || (args.a_ld < args.n && a_rotated)) { return StatusCode::kInvalidLeadDimA; }
+    if ((args.b_ld < args.m && !b_rotated) || (args.b_ld < args.n && b_rotated)) { return StatusCode::kInvalidLeadDimB; }
+    if (buffers.a_mat.GetSize() < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; }
+    if (buffers.b_mat.GetSize() < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; }
+
+    // Matrix copy, scaling, and/or transpose: id1 runs over the 'm' and id2 over the 'n' dimension
+    for (auto id1 = size_t{0}; id1 < args.m; ++id1) {
+      for (auto id2 = size_t{0}; id2 < args.n; ++id2) {
+        const auto a_one = (a_rotated) ? id2 : id1;
+        const auto a_two = (a_rotated) ? id1 : id2;
+        const auto b_one = (b_rotated) ? id2 : id1;
+        const auto b_two = (b_rotated) ? id1 : id2;
+        const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
+        const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
+        b_mat_cpu[b_index] = args.alpha * a_mat_cpu[a_index];
+      }
+    }
+
+    // Data transfer back to OpenCL
+    buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+    return StatusCode::kSuccess;
+  }
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.b_size, static_cast<T>(0));
+    buffers.b_mat.Read(queue, args.b_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer (the B matrix, see RunReference2)
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                           (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    const auto b_one = (b_rotated) ? id2 : id1;
+    const auto b_two = (b_rotated) ? id1 : id2;
+    return b_two * args.b_ld + b_one + args.b_offset;
+  }
+
+  // Describes how to compute performance metrics: one operation and two memory accesses per element
+  static size_t GetFlops(const Arguments<T> &args) {
+    return args.m*args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XOMATCOPY_H_
+#endif
diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.hpp
index 529acfbf..7bc674ab 100644
--- a/test/wrapper_cblas.h
+++ b/test/wrapper_cblas.hpp
@@ -20,7 +20,7 @@ extern "C"
   #include <cblas.h>
 }
 
-#include "internal/utilities.h"
+#include "utilities.hpp"
 
 namespace clblast {
 
@@ -161,6 +161,17 @@ void cblasXswap(const size_t n,
               reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
               reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
 }
+// Half-precision SWAP: swaps converted copies of x and y through another cblasXswap overload
+void cblasXswap(const size_t n,
+                std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  auto x_as_float = HalfToFloatBuffer(x_buffer);
+  auto y_as_float = HalfToFloatBuffer(y_buffer);
+  cblasXswap(n, x_as_float, x_offset, x_inc, y_as_float, y_offset, y_inc);
+  // A swap modifies both vectors, so both are converted back to half-precision
+  FloatToHalfBuffer(x_buffer, x_as_float);
+  FloatToHalfBuffer(y_buffer, y_as_float);
+}
 
 // Forwards the Netlib BLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL
 void cblasXscal(const size_t n,
@@ -193,6 +204,15 @@ void cblasXscal(const size_t n,
               alpha_array.data(),
               reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
 }
+// Half-precision SCAL: scales a converted copy of x and stores the result back as half-precision
+void cblasXscal(const size_t n,
+                const half alpha,
+                std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  auto x_as_float = HalfToFloatBuffer(x_buffer);
+  const auto alpha_as_float = HalfToFloat(alpha);
+  cblasXscal(n, alpha_as_float, x_as_float, x_offset, x_inc);
+  FloatToHalfBuffer(x_buffer, x_as_float);
+}
 
 // Forwards the Netlib BLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY
 void cblasXcopy(const size_t n,
@@ -223,6 +243,16 @@ void cblasXcopy(const size_t n,
               reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
               reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
 }
+// Half-precision COPY: copies between converted buffers through another cblasXcopy overload
+void cblasXcopy(const size_t n,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  auto x_as_float = HalfToFloatBuffer(x_buffer);
+  auto y_as_float = HalfToFloatBuffer(y_buffer);
+  cblasXcopy(n, x_as_float, x_offset, x_inc, y_as_float, y_offset, y_inc);
+  // Only y is an output: x is left untouched
+  FloatToHalfBuffer(y_buffer, y_as_float);
+}
 
 // Forwards the Netlib BLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY
 void cblasXaxpy(const size_t n,
@@ -263,6 +293,18 @@ void cblasXaxpy(const size_t n,
               reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc),
               reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
 }
+// Half-precision AXPY: runs another cblasXaxpy overload on converted buffers and a converted alpha
+void cblasXaxpy(const size_t n,
+                const half alpha,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  auto x_as_float = HalfToFloatBuffer(x_buffer);
+  auto y_as_float = HalfToFloatBuffer(y_buffer);
+  const auto alpha_as_float = HalfToFloat(alpha);
+  cblasXaxpy(n, alpha_as_float, x_as_float, x_offset, x_inc, y_as_float, y_offset, y_inc);
+  // Only y is an output: x is left untouched
+  FloatToHalfBuffer(y_buffer, y_as_float);
+}
 
 // Forwards the Netlib BLAS calls for SDOT/DDOT
 void cblasXdot(const size_t n,
@@ -281,6 +323,19 @@ void cblasXdot(const size_t n,
                                       &x_buffer[x_offset], static_cast<int>(x_inc),
                                       &y_buffer[y_offset], static_cast<int>(y_inc));
 }
+void cblasXdot(const size_t n,
+               std::vector<half>& dot_buffer, const size_t dot_offset,
+               const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+               const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Half-precision path: convert all three buffers, forward to the converted-type overload, store dot back as half
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  auto dot_float = HalfToFloatBuffer(dot_buffer);
+  cblasXdot(n, dot_float, dot_offset,
+            x_float, x_offset, x_inc,
+            y_float, y_offset, y_inc);
+  FloatToHalfBuffer(dot_buffer, dot_float);
+}
 
 // Forwards the Netlib BLAS calls for CDOTU/ZDOTU
 void cblasXdotu(const size_t n,
@@ -347,6 +402,16 @@ void cblasXnrm2(const size_t n,
   nrm2_buffer[nrm2_offset].real(cblas_dznrm2(n,
                                             reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)));
 }
+void cblasXnrm2(const size_t n,
+                std::vector<half>& nrm2_buffer, const size_t nrm2_offset,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  // Half-precision path: convert both buffers, forward to the converted-type overload, store nrm2 back as half
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto nrm2_float = HalfToFloatBuffer(nrm2_buffer);
+  cblasXnrm2(n, nrm2_float, nrm2_offset,
+             x_float, x_offset, x_inc);
+  FloatToHalfBuffer(nrm2_buffer, nrm2_float);
+}
 
 // Forwards the Netlib BLAS calls for SASUM/DASUM/ScASUM/DzASUM
 void cblasXasum(const size_t n,
@@ -373,8 +438,18 @@ void cblasXasum(const size_t n,
   asum_buffer[asum_offset].real(cblas_dzasum(n,
                                             reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc)));
 }
+void cblasXasum(const size_t n,
+                std::vector<half>& asum_buffer, const size_t asum_offset,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  // Half-precision path: convert both buffers, forward to the converted-type overload, store asum back as half
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto asum_float = HalfToFloatBuffer(asum_buffer);
+  cblasXasum(n, asum_float, asum_offset,
+             x_float, x_offset, x_inc);
+  FloatToHalfBuffer(asum_buffer, asum_float);
+}
 
-// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX
+// Forwards the Netlib BLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
 void cblasXamax(const size_t n,
                 std::vector<float>& imax_buffer, const size_t imax_offset,
                 const std::vector<float>& x_buffer, const size_t x_offset, const size_t x_inc) {
@@ -399,6 +474,16 @@ void cblasXamax(const size_t n,
   ((int*)&imax_buffer[0])[imax_offset] = cblas_izamax(n,
                                                      reinterpret_cast<const double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
 }
+void cblasXamax(const size_t n,
+                std::vector<half>& imax_buffer, const size_t imax_offset,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  // Half path via converted copies. NOTE(review): the cblas_izamax overload stores the index as raw int bits - confirm such a value survives FloatToHalfBuffer
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto imax_float = HalfToFloatBuffer(imax_buffer);
+  cblasXamax(n, imax_float, imax_offset,
+             x_float, x_offset, x_inc);
+  FloatToHalfBuffer(imax_buffer, imax_float);
+}
 
 // =================================================================================================
 // BLAS level-2 (matrix-vector) routines
@@ -469,6 +554,25 @@ void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
               beta_array.data(),
               reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
 }
+void cblasXgemv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const half beta,
+                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Half-precision path: convert A, x and y, forward to the converted-type overload, store y back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  cblasXgemv(layout, a_transpose, m, n,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             x_float, x_offset, x_inc,
+             HalfToFloat(beta),
+             y_float, y_offset, y_inc);
+  FloatToHalfBuffer(y_buffer, y_float);
+}
 
 // Forwards the Netlib BLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV
 void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
@@ -535,6 +639,25 @@ void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
               beta_array.data(),
               reinterpret_cast<double*>(&y_buffer[y_offset]), static_cast<int>(y_inc));
 }
+void cblasXgbmv(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const half beta,
+                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Half-precision path: convert A, x and y, forward to the converted-type overload, store y back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  cblasXgbmv(layout, a_transpose, m, n, kl, ku,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             x_float, x_offset, x_inc,
+             HalfToFloat(beta),
+             y_float, y_offset, y_inc);
+  FloatToHalfBuffer(y_buffer, y_float);
+}
 
 // Forwards the Netlib BLAS calls for CHEMV/ZHEMV
 void cblasXhemv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
@@ -675,6 +798,25 @@ void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
               beta,
               &y_buffer[y_offset], static_cast<int>(y_inc));
 }
+void cblasXsymv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const half beta,
+                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Half-precision path: convert A, x and y, forward to the converted-type overload, store y back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  cblasXsymv(layout, triangle, n,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             x_float, x_offset, x_inc,
+             HalfToFloat(beta),
+             y_float, y_offset, y_inc);
+  FloatToHalfBuffer(y_buffer, y_float);
+}
 
 // Forwards the Netlib BLAS calls for SSBMV/DSBMV
 void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
@@ -707,6 +849,25 @@ void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
               beta,
               &y_buffer[y_offset], static_cast<int>(y_inc));
 }
+void cblasXsbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n, const size_t k,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const half beta,
+                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Half-precision path: convert A, x and y, forward to the converted-type overload, store y back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  cblasXsbmv(layout, triangle, n, k,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             x_float, x_offset, x_inc,
+             HalfToFloat(beta),
+             y_float, y_offset, y_inc);
+  FloatToHalfBuffer(y_buffer, y_float);
+}
 
 // Forwards the Netlib BLAS calls for SSPMV/DSPMV
 void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
@@ -739,6 +900,25 @@ void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
               beta,
               &y_buffer[y_offset], static_cast<int>(y_inc));
 }
+void cblasXspmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const half alpha,
+                const std::vector<half>& ap_buffer, const size_t ap_offset,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const half beta,
+                std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc) {
+  // Half-precision path: convert AP, x and y, forward to the converted-type overload, store y back as half
+  auto ap_float = HalfToFloatBuffer(ap_buffer);
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  cblasXspmv(layout, triangle, n,
+             HalfToFloat(alpha),
+             ap_float, ap_offset,
+             x_float, x_offset, x_inc,
+             HalfToFloat(beta),
+             y_float, y_offset, y_inc);
+  FloatToHalfBuffer(y_buffer, y_float);
+}
 
 // Forwards the Netlib BLAS calls for STRMV/DTRMV/CTRMV/ZTRMV
 void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
@@ -777,6 +957,18 @@ void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS
               reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
               reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
 }
+void cblasXtrmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  // Half-precision path: convert A and x, forward to the converted-type overload, store x back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  cblasXtrmv(layout, triangle, a_transpose, diagonal, n,
+             a_float, a_offset, a_ld,
+             x_float, x_offset, x_inc);
+  FloatToHalfBuffer(x_buffer, x_float);
+}
 
 // Forwards the Netlib BLAS calls for STBMV/DTBMV/CTBMV/ZTBMV
 void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
@@ -815,6 +1007,18 @@ void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS
               reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
               reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
 }
+void cblasXtbmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n, const size_t k,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  // Half-precision path: convert A and x, forward to the converted-type overload, store x back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  cblasXtbmv(layout, triangle, a_transpose, diagonal, n, k,
+             a_float, a_offset, a_ld,
+             x_float, x_offset, x_inc);
+  FloatToHalfBuffer(x_buffer, x_float);
+}
 
 // Forwards the Netlib BLAS calls for STPMV/DTPMV/CTPMV/ZTPMV
 void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
@@ -853,6 +1057,18 @@ void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS
               reinterpret_cast<const double*>(&ap_buffer[ap_offset]),
               reinterpret_cast<double*>(&x_buffer[x_offset]), static_cast<int>(x_inc));
 }
+void cblasXtpmv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t n,
+                const std::vector<half>& ap_buffer, const size_t ap_offset,
+                std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc) {
+  // Half-precision path: convert AP and x, forward to the converted-type overload, store x back as half
+  auto ap_float = HalfToFloatBuffer(ap_buffer);
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  cblasXtpmv(layout, triangle, a_transpose, diagonal, n,
+             ap_float, ap_offset,
+             x_float, x_offset, x_inc);
+  FloatToHalfBuffer(x_buffer, x_float);
+}
 
 // Forwards the Netlib BLAS calls for STRSV/DTRSV/CTRSV/ZTRSV
 void cblasXtrsv(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
@@ -995,6 +1211,23 @@ void cblasXger(const CBLAS_ORDER layout,
              &y_buffer[y_offset], static_cast<int>(y_inc),
              &a_buffer[a_offset], a_ld);
 }
+void cblasXger(const CBLAS_ORDER layout,
+               const size_t m, const size_t n,
+               const half alpha,
+               const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+               const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+               std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // Half-precision path: convert x, y and A, forward to the converted-type overload, store A back as half
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  cblasXger(layout, m, n,
+            HalfToFloat(alpha),
+            x_float, x_offset, x_inc,
+            y_float, y_offset, y_inc,
+            a_float, a_offset, a_ld);
+  FloatToHalfBuffer(a_buffer, a_float);
+}
 
 // Forwards the Netlib BLAS calls for CGERU/ZGERU
 void cblasXgeru(const CBLAS_ORDER layout,
@@ -1187,6 +1420,20 @@ void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
              &x_buffer[x_offset], static_cast<int>(x_inc),
              &a_buffer[a_offset], a_ld);
 }
+void cblasXsyr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const half alpha,
+               const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // Half-precision path: convert x and A, forward to the converted-type overload, store A back as half
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  cblasXsyr(layout, triangle, n,
+            HalfToFloat(alpha),
+            x_float, x_offset, x_inc,
+            a_float, a_offset, a_ld);
+  FloatToHalfBuffer(a_buffer, a_float);
+}
 
 // Forwards the Netlib BLAS calls for SSPR/DSPR
 void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
@@ -1211,6 +1458,20 @@ void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
              &x_buffer[x_offset], static_cast<int>(x_inc),
              &ap_buffer[ap_offset]);
 }
+void cblasXspr(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+               const size_t n,
+               const half alpha,
+               const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+               std::vector<half>& ap_buffer, const size_t ap_offset) {
+  // Half-precision path: convert x and AP, forward to the converted-type overload, store AP back as half
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto ap_float = HalfToFloatBuffer(ap_buffer);
+  cblasXspr(layout, triangle, n,
+            HalfToFloat(alpha),
+            x_float, x_offset, x_inc,
+            ap_float, ap_offset);
+  FloatToHalfBuffer(ap_buffer, ap_float);
+}
 
 // Forwards the Netlib BLAS calls for SSYR2/DSYR2
 void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
@@ -1239,6 +1500,23 @@ void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
               &y_buffer[y_offset], static_cast<int>(y_inc),
               &a_buffer[a_offset], a_ld);
 }
+void cblasXsyr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const half alpha,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld) {
+  // Half-precision path: convert x, y and A, forward to the converted-type overload, store A back as half
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  cblasXsyr2(layout, triangle, n,
+             HalfToFloat(alpha),
+             x_float, x_offset, x_inc,
+             y_float, y_offset, y_inc,
+             a_float, a_offset, a_ld);
+  FloatToHalfBuffer(a_buffer, a_float);
+}
 
 // Forwards the Netlib BLAS calls for SSPR2/DSPR2
 void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
@@ -1267,6 +1545,23 @@ void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
               &y_buffer[y_offset], static_cast<int>(y_inc),
               &ap_buffer[ap_offset]);
 }
+void cblasXspr2(const CBLAS_ORDER layout, const CBLAS_UPLO triangle,
+                const size_t n,
+                const half alpha,
+                const std::vector<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                const std::vector<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                std::vector<half>& ap_buffer, const size_t ap_offset) {
+  // Half-precision path: convert x, y and AP, forward to the converted-type overload, store AP back as half
+  auto x_float = HalfToFloatBuffer(x_buffer);
+  auto y_float = HalfToFloatBuffer(y_buffer);
+  auto ap_float = HalfToFloatBuffer(ap_buffer);
+  cblasXspr2(layout, triangle, n,
+             HalfToFloat(alpha),
+             x_float, x_offset, x_inc,
+             y_float, y_offset, y_inc,
+             ap_float, ap_offset);
+  FloatToHalfBuffer(ap_buffer, ap_float);
+}
 
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
@@ -1337,6 +1632,25 @@ void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, con
               beta_array.data(),
               reinterpret_cast<double*>(&c_buffer[c_offset]), c_ld);
 }
+void cblasXgemm(const CBLAS_ORDER layout, const CBLAS_TRANSPOSE a_transpose, const CBLAS_TRANSPOSE b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const half beta,
+                std::vector<half>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Half-precision path: convert A, B and C, forward to the converted-type overload, store C back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto b_float = HalfToFloatBuffer(b_buffer);
+  auto c_float = HalfToFloatBuffer(c_buffer);
+  cblasXgemm(layout, a_transpose, b_transpose, m, n, k,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             b_float, b_offset, b_ld,
+             HalfToFloat(beta),
+             c_float, c_offset, c_ld);
+  FloatToHalfBuffer(c_buffer, c_float);
+}
 
 // Forwards the Netlib BLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM
 void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
@@ -1403,6 +1717,25 @@ void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL
               beta_array.data(),
               reinterpret_cast<double*>(&c_buffer[c_offset]), c_ld);
 }
+void cblasXsymm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
+                const size_t m, const size_t n,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld,
+                const half beta,
+                std::vector<half>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Half-precision path: convert A, B and C, forward to the converted-type overload, store C back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto b_float = HalfToFloatBuffer(b_buffer);
+  auto c_float = HalfToFloatBuffer(c_buffer);
+  cblasXsymm(layout, side, triangle, m, n,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             b_float, b_offset, b_ld,
+             HalfToFloat(beta),
+             c_float, c_offset, c_ld);
+  FloatToHalfBuffer(c_buffer, c_float);
+}
 
 // Forwards the Netlib BLAS calls for CHEMM/ZHEMM
 void cblasXhemm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle,
@@ -1497,6 +1830,22 @@ void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS
               beta_array.data(),
               reinterpret_cast<double*>(&c_buffer[c_offset]), c_ld);
 }
+void cblasXsyrk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
+                const size_t n, const size_t k,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                const half beta,
+                std::vector<half>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Half-precision path: convert A and C, forward to the converted-type overload, store C back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto c_float = HalfToFloatBuffer(c_buffer);
+  cblasXsyrk(layout, triangle, a_transpose, n, k,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             HalfToFloat(beta),
+             c_float, c_offset, c_ld);
+  FloatToHalfBuffer(c_buffer, c_float);
+}
 
 // Forwards the Netlib BLAS calls for CHERK/ZHERK
 void cblasXherk(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose,
@@ -1591,6 +1940,25 @@ void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLA
                beta_array.data(),
                reinterpret_cast<double*>(&c_buffer[c_offset]), c_ld);
 }
+void cblasXsyr2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
+                 const size_t n, const size_t k,
+                 const half alpha,
+                 const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                 const std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld,
+                 const half beta,
+                 std::vector<half>& c_buffer, const size_t c_offset, const size_t c_ld) {
+  // Half-precision path: convert A, B and C, forward to the converted-type overload, store C back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto b_float = HalfToFloatBuffer(b_buffer);
+  auto c_float = HalfToFloatBuffer(c_buffer);
+  cblasXsyr2k(layout, triangle, ab_transpose, n, k,
+              HalfToFloat(alpha),
+              a_float, a_offset, a_ld,
+              b_float, b_offset, b_ld,
+              HalfToFloat(beta),
+              c_float, c_offset, c_ld);
+  FloatToHalfBuffer(c_buffer, c_float);
+}
 
 // Forwards the Netlib BLAS calls for CHER2K/ZHER2K
 void cblasXher2k(const CBLAS_ORDER layout, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE ab_transpose,
@@ -1673,6 +2041,20 @@ void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL
               reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
               reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld);
 }
+void cblasXtrmm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Half-precision path: convert A and B, forward to the converted-type overload, store B back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto b_float = HalfToFloatBuffer(b_buffer);
+  cblasXtrmm(layout, side, triangle, a_transpose, diagonal, m, n,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             b_float, b_offset, b_ld);
+  FloatToHalfBuffer(b_buffer, b_float);
+}
 
 // Forwards the Netlib BLAS calls for STRSM/DTRSM/CTRSM/ZTRSM
 void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
@@ -1721,6 +2103,20 @@ void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPL
               reinterpret_cast<const double*>(&a_buffer[a_offset]), a_ld,
               reinterpret_cast<double*>(&b_buffer[b_offset]), b_ld);
 }
+void cblasXtrsm(const CBLAS_ORDER layout, const CBLAS_SIDE side, const CBLAS_UPLO triangle, const CBLAS_TRANSPOSE a_transpose, const CBLAS_DIAG diagonal,
+                const size_t m, const size_t n,
+                const half alpha,
+                const std::vector<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                std::vector<half>& b_buffer, const size_t b_offset, const size_t b_ld) {
+  // Half-precision path: convert A and B, forward to the converted-type overload, store B back as half
+  auto a_float = HalfToFloatBuffer(a_buffer);
+  auto b_float = HalfToFloatBuffer(b_buffer);
+  cblasXtrsm(layout, side, triangle, a_transpose, diagonal, m, n,
+             HalfToFloat(alpha),
+             a_float, a_offset, a_ld,
+             b_float, b_offset, b_ld);
+  FloatToHalfBuffer(b_buffer, b_float);
+}
 
 // =================================================================================================
 } // namespace clblast
diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.hpp
index 23c55373..3f33890a 100644
--- a/test/wrapper_clblas.h
+++ b/test/wrapper_clblas.hpp
@@ -17,7 +17,7 @@
 
 #include <clBLAS.h>
 
-#include "internal/utilities.h"
+#include "utilities.hpp"
 
 namespace clblast {
 
@@ -34,104 +34,104 @@ clblasSide convertToCLBLAS(const Side v) { return (v == Side::kLeft) ? clblasLef
 
 // Forwards the clBLAS calls for SROTG/DROTG
 template <typename T>
-clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset,
-                         cl_mem sb_buffer, const size_t sb_offset,
-                         cl_mem sc_buffer, const size_t sc_offset,
-                         cl_mem ss_buffer, const size_t ss_offset,
+clblasStatus clblasXrotg(Buffer<T>& sa_buffer, const size_t sa_offset,  // declaration only; bodies are supplied by explicit specializations per element type
+                         Buffer<T>& sb_buffer, const size_t sb_offset,
+                         Buffer<T>& sc_buffer, const size_t sc_offset,
+                         Buffer<T>& ss_buffer, const size_t ss_offset,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
-clblasStatus clblasXrotg<float>(cl_mem sa_buffer, const size_t sa_offset,
-                                cl_mem sb_buffer, const size_t sb_offset,
-                                cl_mem sc_buffer, const size_t sc_offset,
-                                cl_mem ss_buffer, const size_t ss_offset,
+clblasStatus clblasXrotg<float>(Buffer<float>& sa_buffer, const size_t sa_offset,  // float specialization: forwards every argument to clblasSrotg
+                                Buffer<float>& sb_buffer, const size_t sb_offset,
+                                Buffer<float>& sc_buffer, const size_t sc_offset,
+                                Buffer<float>& ss_buffer, const size_t ss_offset,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-  return clblasSrotg(sa_buffer, sa_offset,
-                     sb_buffer, sb_offset,
-                     sc_buffer, sc_offset,
-                     ss_buffer, ss_offset,
+  return clblasSrotg(sa_buffer(), sa_offset,  // NOTE(review): buffer() presumably yields the wrapped cl_mem handle - confirm in Buffer<T>
+                     sb_buffer(), sb_offset,
+                     sc_buffer(), sc_offset,
+                     ss_buffer(), ss_offset,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
-clblasStatus clblasXrotg<double>(cl_mem sa_buffer, const size_t sa_offset,
-                                 cl_mem sb_buffer, const size_t sb_offset,
-                                 cl_mem sc_buffer, const size_t sc_offset,
-                                 cl_mem ss_buffer, const size_t ss_offset,
+clblasStatus clblasXrotg<double>(Buffer<double>& sa_buffer, const size_t sa_offset,  // double specialization: forwards every argument to clblasDrotg
+                                 Buffer<double>& sb_buffer, const size_t sb_offset,
+                                 Buffer<double>& sc_buffer, const size_t sc_offset,
+                                 Buffer<double>& ss_buffer, const size_t ss_offset,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-  return clblasDrotg(sa_buffer, sa_offset,
-                     sb_buffer, sb_offset,
-                     sc_buffer, sc_offset,
-                     ss_buffer, ss_offset,
+  return clblasDrotg(sa_buffer(), sa_offset,  // NOTE(review): buffer() presumably yields the wrapped cl_mem handle - confirm in Buffer<T>
+                     sb_buffer(), sb_offset,
+                     sc_buffer(), sc_offset,
+                     ss_buffer(), ss_offset,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
 // Forwards the clBLAS calls for SROTMG/DROTMG
 template <typename T>
-clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset,
-                          cl_mem sd2_buffer, const size_t sd2_offset,
-                          cl_mem sx1_buffer, const size_t sx1_offset,
-                          const cl_mem sy1_buffer, const size_t sy1_offset,
-                          cl_mem sparam_buffer, const size_t sparam_offset,
+clblasStatus clblasXrotmg(Buffer<T>& sd1_buffer, const size_t sd1_offset,  // declaration only; bodies are supplied by explicit specializations per element type
+                          Buffer<T>& sd2_buffer, const size_t sd2_offset,
+                          Buffer<T>& sx1_buffer, const size_t sx1_offset,
+                          const Buffer<T>& sy1_buffer, const size_t sy1_offset,  // the only buffer taken by const reference
+                          Buffer<T>& sparam_buffer, const size_t sparam_offset,
                           cl_uint num_queues, cl_command_queue *queues,
                           cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
-clblasStatus clblasXrotmg<float>(cl_mem sd1_buffer, const size_t sd1_offset,
-                                 cl_mem sd2_buffer, const size_t sd2_offset,
-                                 cl_mem sx1_buffer, const size_t sx1_offset,
-                                 const cl_mem sy1_buffer, const size_t sy1_offset,
-                                 cl_mem sparam_buffer, const size_t sparam_offset,
+clblasStatus clblasXrotmg<float>(Buffer<float>& sd1_buffer, const size_t sd1_offset,  // float specialization: forwards every argument to clblasSrotmg
+                                 Buffer<float>& sd2_buffer, const size_t sd2_offset,
+                                 Buffer<float>& sx1_buffer, const size_t sx1_offset,
+                                 const Buffer<float>& sy1_buffer, const size_t sy1_offset,
+                                 Buffer<float>& sparam_buffer, const size_t sparam_offset,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-  return clblasSrotmg(sd1_buffer, sd1_offset,
-                      sd2_buffer, sd2_offset,
-                      sx1_buffer, sx1_offset,
-                      sy1_buffer, sy1_offset,
-                      sparam_buffer, sparam_offset,
+  return clblasSrotmg(sd1_buffer(), sd1_offset,  // NOTE(review): buffer() presumably yields the wrapped cl_mem handle - confirm in Buffer<T>
+                      sd2_buffer(), sd2_offset,
+                      sx1_buffer(), sx1_offset,
+                      sy1_buffer(), sy1_offset,
+                      sparam_buffer(), sparam_offset,
                       num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
-clblasStatus clblasXrotmg<double>(cl_mem sd1_buffer, const size_t sd1_offset,
-                                  cl_mem sd2_buffer, const size_t sd2_offset,
-                                  cl_mem sx1_buffer, const size_t sx1_offset,
-                                  const cl_mem sy1_buffer, const size_t sy1_offset,
-                                  cl_mem sparam_buffer, const size_t sparam_offset,
+clblasStatus clblasXrotmg<double>(Buffer<double>& sd1_buffer, const size_t sd1_offset,
+                                  Buffer<double>& sd2_buffer, const size_t sd2_offset,
+                                  Buffer<double>& sx1_buffer, const size_t sx1_offset,
+                                  const Buffer<double>& sy1_buffer, const size_t sy1_offset,
+                                  Buffer<double>& sparam_buffer, const size_t sparam_offset,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
-  return clblasDrotmg(sd1_buffer, sd1_offset,
-                      sd2_buffer, sd2_offset,
-                      sx1_buffer, sx1_offset,
-                      sy1_buffer, sy1_offset,
-                      sparam_buffer, sparam_offset,
+  return clblasDrotmg(sd1_buffer(), sd1_offset,
+                      sd2_buffer(), sd2_offset,
+                      sx1_buffer(), sx1_offset,
+                      sy1_buffer(), sy1_offset,
+                      sparam_buffer(), sparam_offset,
                       num_queues, queues, num_wait_events, wait_events, events);
 }
 
 // Forwards the clBLAS calls for SROT/DROT
 clblasStatus clblasXrot(const size_t n,
-                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                         const float cos,
                         const float sin,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSrot(n,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    y_buffer, y_offset, static_cast<int>(y_inc),
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    y_buffer(), y_offset, static_cast<int>(y_inc),
                     cos,
                     sin,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXrot(const size_t n,
-                        cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                         const double cos,
                         const double sin,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDrot(n,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    y_buffer, y_offset, static_cast<int>(y_inc),
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    y_buffer(), y_offset, static_cast<int>(y_inc),
                     cos,
                     sin,
                     num_queues, queues, num_wait_events, wait_events, events);
@@ -140,316 +140,394 @@ clblasStatus clblasXrot(const size_t n,
 // Forwards the clBLAS calls for SROTM/DROTM
 template <typename T>
 clblasStatus clblasXrotm(const size_t n,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem sparam_buffer, const size_t sparam_offset,
+                         Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<T>& sparam_buffer, const size_t sparam_offset,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXrotm<float>(const size_t n,
-                                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                                cl_mem sparam_buffer, const size_t sparam_offset,
+                                Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                                Buffer<float>& sparam_buffer, const size_t sparam_offset,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSrotm(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     sparam_buffer, sparam_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     sparam_buffer(), sparam_offset,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXrotm<double>(const size_t n,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                                 cl_mem sparam_buffer, const size_t sparam_offset,
+                                 Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                                 Buffer<double>& sparam_buffer, const size_t sparam_offset,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDrotm(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     sparam_buffer, sparam_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     sparam_buffer(), sparam_offset,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
 // Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP
 template <typename T>
 clblasStatus clblasXswap(const size_t n,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXswap<float>(const size_t n,
-                                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSswap(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXswap<double>(const size_t n,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                 Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDswap(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXswap<float2>(const size_t n,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                 Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCswap(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXswap<double2>(const size_t n,
-                                  cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                  cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                  Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                  Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZswap(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXswap<half>(const size_t n,
+                               Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_uint num_queues, cl_command_queue *queues,
+                               cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto status = clblasXswap(n,
+                            x_buffer_bis, x_offset, x_inc,
+                            y_buffer_bis, y_offset, y_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]);
+  FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL
 clblasStatus clblasXscal(const size_t n,
                          const float alpha,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSscal(n,
                      alpha,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXscal(const size_t n,
                          const double alpha,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDscal(n,
                      alpha,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXscal(const size_t n,
                          const float2 alpha,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCscal(n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXscal(const size_t n,
                          const double2 alpha,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZscal(n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXscal(const size_t n,
+                         const half alpha,
+                         Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto status = clblasXscal(n,
+                            HalfToFloat(alpha),
+                            x_buffer_bis, x_offset, x_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY
 template <typename T>
 clblasStatus clblasXcopy(const size_t n,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXcopy<float>(const size_t n,
-                                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasScopy(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXcopy<double>(const size_t n,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                 const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDcopy(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXcopy<float2>(const size_t n,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                 cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                 const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCcopy(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXcopy<double2>(const size_t n,
-                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                  cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                  const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                  Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZcopy(n,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXcopy<half>(const size_t n,
+                               const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_uint num_queues, cl_command_queue *queues,
+                               cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto status = clblasXcopy(n,
+                            x_buffer_bis, x_offset, x_inc,
+                            y_buffer_bis, y_offset, y_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY
 clblasStatus clblasXaxpy(const size_t n,
                          const float alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSaxpy(n,
                      alpha,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXaxpy(const size_t n,
                          const double alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDaxpy(n,
                      alpha,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXaxpy(const size_t n,
                          const float2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCaxpy(n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXaxpy(const size_t n,
                          const double2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZaxpy(n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXaxpy(const size_t n,
+                         const half alpha,
+                         const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto status = clblasXaxpy(n,
+                            HalfToFloat(alpha),
+                            x_buffer_bis, x_offset, x_inc,
+                            y_buffer_bis, y_offset, y_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SDOT/DDOT
 template <typename T>
 clblasStatus clblasXdot(const size_t n,
-                        cl_mem dot_buffer, const size_t dot_offset,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                        Buffer<T>& dot_buffer, const size_t dot_offset,
+                        const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        const Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXdot<float>(const size_t n,
-                               cl_mem dot_buffer, const size_t dot_offset,
-                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               Buffer<float>& dot_buffer, const size_t dot_offset,
+                               const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                                cl_uint num_queues, cl_command_queue *queues,
                                cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float>(context, n);
   return clblasSdot(n,
-                    dot_buffer, dot_offset,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    y_buffer, y_offset, static_cast<int>(y_inc),
+                    dot_buffer(), dot_offset,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    y_buffer(), y_offset, static_cast<int>(y_inc),
                     scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXdot<double>(const size_t n,
-                                cl_mem dot_buffer, const size_t dot_offset,
-                                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                Buffer<double>& dot_buffer, const size_t dot_offset,
+                                const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double>(context, n);
   return clblasDdot(n,
-                    dot_buffer, dot_offset,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    y_buffer, y_offset, static_cast<int>(y_inc),
+                    dot_buffer(), dot_offset,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    y_buffer(), y_offset, static_cast<int>(y_inc),
                     scratch_buffer(),
                     num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXdot<half>(const size_t n,
+                              Buffer<half>& dot_buffer, const size_t dot_offset,
+                              const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                              const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                              cl_uint num_queues, cl_command_queue *queues,
+                              cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto dot_buffer_bis = HalfToFloatBuffer(dot_buffer, queues[0]);
+  auto status = clblasXdot(n,
+                           dot_buffer_bis, dot_offset,
+                           x_buffer_bis, x_offset, x_inc,
+                           y_buffer_bis, y_offset, y_inc,
+                           num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(dot_buffer, dot_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for CDOTU/ZDOTU
 template <typename T>
 clblasStatus clblasXdotu(const size_t n,
-                         cl_mem dot_buffer, const size_t dot_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<T>& dot_buffer, const size_t dot_offset,
+                         const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXdotu<float2>(const size_t n,
-                                 cl_mem dot_buffer, const size_t dot_offset,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                 Buffer<float2>& dot_buffer, const size_t dot_offset,
+                                 const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float2>(context, n);
   return clblasCdotu(n,
-                     dot_buffer, dot_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     dot_buffer(), dot_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXdotu<double2>(const size_t n,
-                                  cl_mem dot_buffer, const size_t dot_offset,
-                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                  const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                  Buffer<double2>& dot_buffer, const size_t dot_offset,
+                                  const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double2>(context, n);
   return clblasZdotu(n,
-                     dot_buffer, dot_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     dot_buffer(), dot_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
@@ -457,42 +535,42 @@ clblasStatus clblasXdotu<double2>(const size_t n,
 // Forwards the clBLAS calls for CDOTC/ZDOTC
 template <typename T>
 clblasStatus clblasXdotc(const size_t n,
-                         cl_mem dot_buffer, const size_t dot_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<T>& dot_buffer, const size_t dot_offset,
+                         const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<T>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXdotc<float2>(const size_t n,
-                                 cl_mem dot_buffer, const size_t dot_offset,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                 const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                 Buffer<float2>& dot_buffer, const size_t dot_offset,
+                                 const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float2>(context, n);
   return clblasCdotc(n,
-                     dot_buffer, dot_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     dot_buffer(), dot_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXdotc<double2>(const size_t n,
-                                  cl_mem dot_buffer, const size_t dot_offset,
-                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                                  const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                                  Buffer<double2>& dot_buffer, const size_t dot_offset,
+                                  const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double2>(context, n);
   return clblasZdotc(n,
-                     dot_buffer, dot_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     dot_buffer(), dot_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
@@ -500,206 +578,251 @@ clblasStatus clblasXdotc<double2>(const size_t n,
 // Forwards the clBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2
 template <typename T>
 clblasStatus clblasXnrm2(const size_t n,
-                         cl_mem nrm2_buffer, const size_t nrm2_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<T>& nrm2_buffer, const size_t nrm2_offset,
+                         const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXnrm2<float>(const size_t n,
-                                cl_mem nrm2_buffer, const size_t nrm2_offset,
-                                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                Buffer<float>& nrm2_buffer, const size_t nrm2_offset,
+                                const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float>(context, 2*n);
   return clblasSnrm2(n,
-                     nrm2_buffer, nrm2_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     nrm2_buffer(), nrm2_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXnrm2<double>(const size_t n,
-                                 cl_mem nrm2_buffer, const size_t nrm2_offset,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<double>& nrm2_buffer, const size_t nrm2_offset,
+                                 const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double>(context, 2*n);
   return clblasDnrm2(n,
-                     nrm2_buffer, nrm2_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     nrm2_buffer(), nrm2_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXnrm2<float2>(const size_t n,
-                                 cl_mem nrm2_buffer, const size_t nrm2_offset,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<float2>& nrm2_buffer, const size_t nrm2_offset,
+                                 const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float2>(context, 2*n);
   return clblasScnrm2(n,
-                     nrm2_buffer, nrm2_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     nrm2_buffer(), nrm2_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXnrm2<double2>(const size_t n,
-                                  cl_mem nrm2_buffer, const size_t nrm2_offset,
-                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  Buffer<double2>& nrm2_buffer, const size_t nrm2_offset,
+                                  const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double2>(context, 2*n);
   return clblasDznrm2(n,
-                     nrm2_buffer, nrm2_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     nrm2_buffer(), nrm2_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXnrm2<half>(const size_t n,
+                               Buffer<half>& nrm2_buffer, const size_t nrm2_offset,
+                               const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_uint num_queues, cl_command_queue *queues,
+                               cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto nrm2_buffer_bis = HalfToFloatBuffer(nrm2_buffer, queues[0]);
+  auto status = clblasXnrm2(n,
+                            nrm2_buffer_bis, nrm2_offset,
+                            x_buffer_bis, x_offset, x_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(nrm2_buffer, nrm2_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SASUM/DASUM/ScASUM/DzASUM
 template <typename T>
 clblasStatus clblasXasum(const size_t n,
-                         cl_mem asum_buffer, const size_t asum_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<T>& asum_buffer, const size_t asum_offset,
+                         const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXasum<float>(const size_t n,
-                                cl_mem asum_buffer, const size_t asum_offset,
-                                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                Buffer<float>& asum_buffer, const size_t asum_offset,
+                                const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float>(context, n);
   return clblasSasum(n,
-                     asum_buffer, asum_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     asum_buffer(), asum_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXasum<double>(const size_t n,
-                                 cl_mem asum_buffer, const size_t asum_offset,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<double>& asum_buffer, const size_t asum_offset,
+                                 const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double>(context, n);
   return clblasDasum(n,
-                     asum_buffer, asum_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     asum_buffer(), asum_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXasum<float2>(const size_t n,
-                                 cl_mem asum_buffer, const size_t asum_offset,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<float2>& asum_buffer, const size_t asum_offset,
+                                 const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float2>(context, n);
   return clblasScasum(n,
-                     asum_buffer, asum_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     asum_buffer(), asum_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXasum<double2>(const size_t n,
-                                  cl_mem asum_buffer, const size_t asum_offset,
-                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  Buffer<double2>& asum_buffer, const size_t asum_offset,
+                                  const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double2>(context, n);
   return clblasDzasum(n,
-                     asum_buffer, asum_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     asum_buffer(), asum_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXasum<half>(const size_t n,
+                               Buffer<half>& asum_buffer, const size_t asum_offset,
+                               const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_uint num_queues, cl_command_queue *queues,
+                               cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto asum_buffer_bis = HalfToFloatBuffer(asum_buffer, queues[0]);
+  auto status = clblasXasum(n,
+                            asum_buffer_bis, asum_offset,
+                            x_buffer_bis, x_offset, x_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(asum_buffer, asum_buffer_bis, queues[0]);
+  return status;
+}
 
-// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX
+// Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
 template <typename T>
 clblasStatus clblasXamax(const size_t n,
-                         cl_mem imax_buffer, const size_t imax_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         Buffer<T>& imax_buffer, const size_t imax_offset,
+                         const Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXamax<float>(const size_t n,
-                                cl_mem imax_buffer, const size_t imax_offset,
-                                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                Buffer<float>& imax_buffer, const size_t imax_offset,
+                                const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float>(context, 2*n);
   return clblasiSamax(n,
-                     imax_buffer, imax_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     imax_buffer(), imax_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXamax<double>(const size_t n,
-                                 cl_mem imax_buffer, const size_t imax_offset,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<double>& imax_buffer, const size_t imax_offset,
+                                 const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double>(context, 2*n);
   return clblasiDamax(n,
-                     imax_buffer, imax_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     imax_buffer(), imax_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXamax<float2>(const size_t n,
-                                 cl_mem imax_buffer, const size_t imax_offset,
-                                 const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 Buffer<float2>& imax_buffer, const size_t imax_offset,
+                                 const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<float2>(context, 2*n);
   return clblasiCamax(n,
-                     imax_buffer, imax_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     imax_buffer(), imax_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXamax<double2>(const size_t n,
-                                  cl_mem imax_buffer, const size_t imax_offset,
-                                  const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  Buffer<double2>& imax_buffer, const size_t imax_offset,
+                                  const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
   auto context = queue.GetContext();
   auto scratch_buffer = Buffer<double2>(context, 2*n);
   return clblasiZamax(n,
-                     imax_buffer, imax_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     imax_buffer(), imax_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXamax<half>(const size_t n,
+                               Buffer<half>& imax_buffer, const size_t imax_offset,
+                               const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_uint num_queues, cl_command_queue *queues,
+                               cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer, queues[0]);
+  auto status = clblasXamax(n,
+                            imax_buffer_bis, imax_offset,
+                            x_buffer_bis, x_offset, x_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(imax_buffer, imax_buffer_bis, queues[0]);
+  return status;
+}
 
 // =================================================================================================
 // BLAS level-2 (matrix-vector) routines
@@ -709,185 +832,231 @@ clblasStatus clblasXamax<double2>(const size_t n,
 clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose,
                          const size_t m, const size_t n,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSgemv(layout, a_transpose,
                      m, n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose,
                          const size_t m, const size_t n,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDgemv(layout, a_transpose,
                      m, n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose,
                          const size_t m, const size_t n,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCgemv(layout, a_transpose,
                      m, n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_float2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose,
                          const size_t m, const size_t n,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZgemv(layout, a_transpose,
                      m, n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_double2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose,
+                         const size_t m, const size_t n,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const half beta,
+                         Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto status = clblasXgemv(layout, a_transpose,
+                            m, n,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            x_buffer_bis, x_offset, x_inc,
+                            HalfToFloat(beta),
+                            y_buffer_bis, y_offset, y_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV
 clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose,
                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSgbmv(layout, a_transpose,
                      m, n, kl, ku,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose,
                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDgbmv(layout, a_transpose,
                      m, n, kl, ku,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose,
                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCgbmv(layout, a_transpose,
                      m, n, kl, ku,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_float2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose,
                          const size_t m, const size_t n, const size_t kl, const size_t ku,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZgbmv(layout, a_transpose,
                      m, n, kl, ku,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_double2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose,
+                         const size_t m, const size_t n, const size_t kl, const size_t ku,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const half beta,
+                         Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto status = clblasXgbmv(layout, a_transpose,
+                            m, n, kl, ku,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            x_buffer_bis, x_offset, x_inc,
+                            HalfToFloat(beta),
+                            y_buffer_bis, y_offset, y_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for CHEMV/ZHEMV
 clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasChemv(layout, triangle,
                      n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_float2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZhemv(layout, triangle,
                      n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_double2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -895,37 +1064,37 @@ clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle,
 clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n, const size_t k,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasChbmv(layout, triangle,
                      n, k,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_float2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n, const size_t k,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZhbmv(layout, triangle,
                      n, k,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_double2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -933,37 +1102,37 @@ clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle,
 clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const float2 alpha,
-                         const cl_mem ap_buffer, const size_t ap_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& ap_buffer, const size_t ap_offset,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasChpmv(layout, triangle,
                      n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_float2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const double2 alpha,
-                         const cl_mem ap_buffer, const size_t ap_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& ap_buffer, const size_t ap_offset,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double2 beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZhpmv(layout, triangle,
                      n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      cl_double2{{beta.real(), beta.imag()}},
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -971,129 +1140,198 @@ clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle,
 clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSsymv(layout, triangle,
                      n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDsymv(layout, triangle,
                      n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle,
+                         const size_t n,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const half beta,
+                         Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto status = clblasXsymv(layout, triangle,
+                            n,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            x_buffer_bis, x_offset, x_inc,
+                            HalfToFloat(beta),
+                            y_buffer_bis, y_offset, y_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SSBMV/DSBMV
 clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n, const size_t k,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSsbmv(layout, triangle,
                      n, k,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n, const size_t k,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDsbmv(layout, triangle,
                      n, k,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle,
+                         const size_t n, const size_t k,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const half beta,
+                         Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto status = clblasXsbmv(layout, triangle,
+                            n, k,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            x_buffer_bis, x_offset, x_inc,
+                            HalfToFloat(beta),
+                            y_buffer_bis, y_offset, y_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SSPMV/DSPMV
 clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const float alpha,
-                         const cl_mem ap_buffer, const size_t ap_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float>& ap_buffer, const size_t ap_offset,
+                         const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const float beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSspmv(layout, triangle,
                      n,
                      alpha,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const double alpha,
-                         const cl_mem ap_buffer, const size_t ap_offset,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double>& ap_buffer, const size_t ap_offset,
+                         const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                          const double beta,
-                         cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDspmv(layout, triangle,
                      n,
                      alpha,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      beta,
-                     y_buffer, y_offset, static_cast<int>(y_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle,
+                         const size_t n,
+                         const half alpha,
+                         const Buffer<half>& ap_buffer, const size_t ap_offset,
+                         const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const half beta,
+                         Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]);
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto status = clblasXspmv(layout, triangle,
+                            n,
+                            HalfToFloat(alpha),
+                            ap_buffer_bis, ap_offset,
+                            x_buffer_bis, x_offset, x_inc,
+                            HalfToFloat(beta),
+                            y_buffer_bis, y_offset, y_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV
 template <typename T>
 clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t n,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                 const size_t n,
-                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1101,16 +1339,16 @@ clblasStatus clblasXtrmv<float>(const clblasOrder layout, const clblasUplo trian
   auto scratch_buffer = Buffer<float>(context, n);
   return clblasStrmv(layout, triangle, a_transpose, diagonal,
                      n,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n,
-                                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                 Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1118,16 +1356,16 @@ clblasStatus clblasXtrmv<double>(const clblasOrder layout, const clblasUplo tria
   auto scratch_buffer = Buffer<double>(context, n);
   return clblasDtrmv(layout, triangle, a_transpose, diagonal,
                      n,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n,
-                                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                 Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1135,16 +1373,16 @@ clblasStatus clblasXtrmv<float2>(const clblasOrder layout, const clblasUplo tria
   auto scratch_buffer = Buffer<float2>(context, n);
   return clblasCtrmv(layout, triangle, a_transpose, diagonal,
                      n,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                   const size_t n,
-                                  const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                  cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                  Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1152,25 +1390,42 @@ clblasStatus clblasXtrmv<double2>(const clblasOrder layout, const clblasUplo tri
   auto scratch_buffer = Buffer<double2>(context, n);
   return clblasZtrmv(layout, triangle, a_transpose, diagonal,
                      n,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXtrmv<half>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
+                               const size_t n,
+                               const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                               Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_uint num_queues, cl_command_queue *queues,
+                               cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto status = clblasXtrmv(layout, triangle, a_transpose, diagonal,
+                            n,
+                            a_buffer_bis, a_offset, a_ld,
+                            x_buffer_bis, x_offset, x_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV
 template <typename T>
 clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t n, const size_t k,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                 const size_t n, const size_t k,
-                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1178,16 +1433,16 @@ clblasStatus clblasXtbmv<float>(const clblasOrder layout, const clblasUplo trian
   auto scratch_buffer = Buffer<float>(context, n);
   return clblasStbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n, const size_t k,
-                                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                 Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1195,16 +1450,16 @@ clblasStatus clblasXtbmv<double>(const clblasOrder layout, const clblasUplo tria
   auto scratch_buffer = Buffer<double>(context, n);
   return clblasDtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n, const size_t k,
-                                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                 Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1212,16 +1467,16 @@ clblasStatus clblasXtbmv<float2>(const clblasOrder layout, const clblasUplo tria
   auto scratch_buffer = Buffer<float2>(context, n);
   return clblasCtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                   const size_t n, const size_t k,
-                                  const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                  cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                  Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1229,25 +1484,42 @@ clblasStatus clblasXtbmv<double2>(const clblasOrder layout, const clblasUplo tri
   auto scratch_buffer = Buffer<double2>(context, n);
   return clblasZtbmv(layout, triangle, a_transpose, diagonal,
                      n, k,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXtbmv<half>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
+                               const size_t n, const size_t k,
+                               const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                               Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_uint num_queues, cl_command_queue *queues,
+                               cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto status = clblasXtbmv(layout, triangle, a_transpose, diagonal,
+                            n, k,
+                            a_buffer_bis, a_offset, a_ld,
+                            x_buffer_bis, x_offset, x_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV
 template <typename T>
 clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t n,
-                         const cl_mem ap_buffer, const size_t ap_offset,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<T>& ap_buffer, const size_t ap_offset,
+                         Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                 const size_t n,
-                                const cl_mem ap_buffer, const size_t ap_offset,
-                                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                const Buffer<float>& ap_buffer, const size_t ap_offset,
+                                Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1255,16 +1527,16 @@ clblasStatus clblasXtpmv<float>(const clblasOrder layout, const clblasUplo trian
   auto scratch_buffer = Buffer<float>(context, n);
   return clblasStpmv(layout, triangle, a_transpose, diagonal,
                      n,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n,
-                                 const cl_mem ap_buffer, const size_t ap_offset,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<double>& ap_buffer, const size_t ap_offset,
+                                 Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1272,16 +1544,16 @@ clblasStatus clblasXtpmv<double>(const clblasOrder layout, const clblasUplo tria
   auto scratch_buffer = Buffer<double>(context, n);
   return clblasDtpmv(layout, triangle, a_transpose, diagonal,
                      n,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n,
-                                 const cl_mem ap_buffer, const size_t ap_offset,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<float2>& ap_buffer, const size_t ap_offset,
+                                 Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1289,16 +1561,16 @@ clblasStatus clblasXtpmv<float2>(const clblasOrder layout, const clblasUplo tria
   auto scratch_buffer = Buffer<float2>(context, n);
   return clblasCtpmv(layout, triangle, a_transpose, diagonal,
                      n,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                   const size_t n,
-                                  const cl_mem ap_buffer, const size_t ap_offset,
-                                  cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const Buffer<double2>& ap_buffer, const size_t ap_offset,
+                                  Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   auto queue = Queue(queues[0]);
@@ -1306,70 +1578,87 @@ clblasStatus clblasXtpmv<double2>(const clblasOrder layout, const clblasUplo tri
   auto scratch_buffer = Buffer<double2>(context, n);
   return clblasZtpmv(layout, triangle, a_transpose, diagonal,
                      n,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      scratch_buffer(),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+template <>
+clblasStatus clblasXtpmv<half>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
+                               const size_t n,
+                               const Buffer<half>& ap_buffer, const size_t ap_offset,
+                               Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_uint num_queues, cl_command_queue *queues,
+                               cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]);
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto status = clblasXtpmv(layout, triangle, a_transpose, diagonal,
+                            n,
+                            ap_buffer_bis, ap_offset,
+                            x_buffer_bis, x_offset, x_inc,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV
 template <typename T>
 clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t n,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXtrsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                 const size_t n,
-                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasStrsv(layout, triangle, a_transpose, diagonal,
                      n,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtrsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n,
-                                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                 Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDtrsv(layout, triangle, a_transpose, diagonal,
                      n,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtrsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n,
-                                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                 Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCtrsv(layout, triangle, a_transpose, diagonal,
                      n,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtrsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                   const size_t n,
-                                  const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                  cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                  Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZtrsv(layout, triangle, a_transpose, diagonal,
                      n,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1377,60 +1666,60 @@ clblasStatus clblasXtrsv<double2>(const clblasOrder layout, const clblasUplo tri
 template <typename T>
 clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t n, const size_t k,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<T>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXtbsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                 const size_t n, const size_t k,
-                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasStbsv(layout, triangle, a_transpose, diagonal,
                      n, k,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtbsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n, const size_t k,
-                                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                 Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDtbsv(layout, triangle, a_transpose, diagonal,
                      n, k,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtbsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n, const size_t k,
-                                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                 Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCtbsv(layout, triangle, a_transpose, diagonal,
                      n, k,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtbsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                   const size_t n, const size_t k,
-                                  const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                                  cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                                  Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZtbsv(layout, triangle, a_transpose, diagonal,
                      n, k,
-                     a_buffer, a_offset, a_ld,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     a_buffer(), a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1438,60 +1727,60 @@ clblasStatus clblasXtbsv<double2>(const clblasOrder layout, const clblasUplo tri
 template <typename T>
 clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t n,
-                         const cl_mem ap_buffer, const size_t ap_offset,
-                         cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<T>& ap_buffer, const size_t ap_offset,
+                         Buffer<T>& x_buffer, const size_t x_offset, const size_t x_inc,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events);
 template <>
 clblasStatus clblasXtpsv<float>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                 const size_t n,
-                                const cl_mem ap_buffer, const size_t ap_offset,
-                                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                const Buffer<float>& ap_buffer, const size_t ap_offset,
+                                Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
                                 cl_uint num_queues, cl_command_queue *queues,
                                 cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasStpsv(layout, triangle, a_transpose, diagonal,
                      n,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtpsv<double>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n,
-                                 const cl_mem ap_buffer, const size_t ap_offset,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<double>& ap_buffer, const size_t ap_offset,
+                                 Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDtpsv(layout, triangle, a_transpose, diagonal,
                      n,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtpsv<float2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                  const size_t n,
-                                 const cl_mem ap_buffer, const size_t ap_offset,
-                                 cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                 const Buffer<float2>& ap_buffer, const size_t ap_offset,
+                                 Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                  cl_uint num_queues, cl_command_queue *queues,
                                  cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCtpsv(layout, triangle, a_transpose, diagonal,
                      n,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 template <>
 clblasStatus clblasXtpsv<double2>(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                                   const size_t n,
-                                  const cl_mem ap_buffer, const size_t ap_offset,
-                                  cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                                  const Buffer<double2>& ap_buffer, const size_t ap_offset,
+                                  Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
                                   cl_uint num_queues, cl_command_queue *queues,
                                   cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZtpsv(layout, triangle, a_transpose, diagonal,
                      n,
-                     ap_buffer, ap_offset,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
+                     ap_buffer(), ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1499,67 +1788,88 @@ clblasStatus clblasXtpsv<double2>(const clblasOrder layout, const clblasUplo tri
 clblasStatus clblasXger(const clblasOrder layout,
                         const size_t m, const size_t n,
                         const float alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                        Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSger(layout,
                     m, n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    y_buffer, y_offset, static_cast<int>(y_inc),
-                    a_buffer, a_offset, a_ld,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    y_buffer(), y_offset, static_cast<int>(y_inc),
+                    a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXger(const clblasOrder layout,
                         const size_t m, const size_t n,
                         const double alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                        Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDger(layout,
                     m, n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    y_buffer, y_offset, static_cast<int>(y_inc),
-                    a_buffer, a_offset, a_ld,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    y_buffer(), y_offset, static_cast<int>(y_inc),
+                    a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXger(const clblasOrder layout,
+                        const size_t m, const size_t n,
+                        const half alpha,
+                        const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                        Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_uint num_queues, cl_command_queue *queues,
+                        cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto status = clblasXger(layout,
+                           m, n,
+                           HalfToFloat(alpha),
+                           x_buffer_bis, x_offset, x_inc,
+                           y_buffer_bis, y_offset, y_inc,
+                           a_buffer_bis, a_offset, a_ld,
+                           num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for CGERU/ZGERU
 clblasStatus clblasXgeru(const clblasOrder layout,
                          const size_t m, const size_t n,
                          const float2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCgeru(layout,
                      m, n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     a_buffer, a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     a_buffer(), a_offset, a_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgeru(const clblasOrder layout,
                          const size_t m, const size_t n,
                          const double2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZgeru(layout,
                      m, n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     a_buffer, a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     a_buffer(), a_offset, a_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1567,33 +1877,33 @@ clblasStatus clblasXgeru(const clblasOrder layout,
 clblasStatus clblasXgerc(const clblasOrder layout,
                          const size_t m, const size_t n,
                          const float2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCgerc(layout,
                      m, n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     a_buffer, a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     a_buffer(), a_offset, a_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgerc(const clblasOrder layout,
                          const size_t m, const size_t n,
                          const double2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZgerc(layout,
                      m, n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     a_buffer, a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     a_buffer(), a_offset, a_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1601,29 +1911,29 @@ clblasStatus clblasXgerc(const clblasOrder layout,
 clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle,
                         const size_t n,
                         const float alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCher(layout, triangle,
                     n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    a_buffer, a_offset, a_ld,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle,
                         const size_t n,
                         const double alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZher(layout, triangle,
                     n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    a_buffer, a_offset, a_ld,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1631,29 +1941,29 @@ clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle,
 clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle,
                         const size_t n,
                         const float alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem ap_buffer, const size_t ap_offset,
+                        const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<float2>& ap_buffer, const size_t ap_offset,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasChpr(layout, triangle,
                     n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    ap_buffer, ap_offset,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    ap_buffer(), ap_offset,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle,
                         const size_t n,
                         const double alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem ap_buffer, const size_t ap_offset,
+                        const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<double2>& ap_buffer, const size_t ap_offset,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZhpr(layout, triangle,
                     n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    ap_buffer, ap_offset,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    ap_buffer(), ap_offset,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1661,33 +1971,33 @@ clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle,
 clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const float2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCher2(layout, triangle,
                      n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     a_buffer, a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     a_buffer(), a_offset, a_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const double2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZher2(layout, triangle,
                      n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     a_buffer, a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     a_buffer(), a_offset, a_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1695,33 +2005,33 @@ clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle,
 clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const float2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem ap_buffer, const size_t ap_offset,
+                         const Buffer<float2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float2>& ap_buffer, const size_t ap_offset,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasChpr2(layout, triangle,
                      n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     ap_buffer, ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     ap_buffer(), ap_offset,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const double2 alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem ap_buffer, const size_t ap_offset,
+                         const Buffer<double2>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double2>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double2>& ap_buffer, const size_t ap_offset,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZhpr2(layout, triangle,
                      n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     ap_buffer, ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     ap_buffer(), ap_offset,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -1729,129 +2039,207 @@ clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle,
 clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle,
                         const size_t n,
                         const float alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSsyr(layout, triangle,
                     n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    a_buffer, a_offset, a_ld,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle,
                         const size_t n,
                         const double alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDsyr(layout, triangle,
                     n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    a_buffer, a_offset, a_ld,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    a_buffer(), a_offset, a_ld,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle,
+                        const size_t n,
+                        const half alpha,
+                        const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                        cl_uint num_queues, cl_command_queue *queues,
+                        cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto status = clblasXsyr(layout, triangle,
+                           n,
+                           HalfToFloat(alpha),
+                           x_buffer_bis, x_offset, x_inc,
+                           a_buffer_bis, a_offset, a_ld,
+                           num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SSPR/DSPR
 clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle,
                         const size_t n,
                         const float alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem ap_buffer, const size_t ap_offset,
+                        const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<float>& ap_buffer, const size_t ap_offset,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSspr(layout, triangle,
                     n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    ap_buffer, ap_offset,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    ap_buffer(), ap_offset,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle,
                         const size_t n,
                         const double alpha,
-                        const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                        cl_mem ap_buffer, const size_t ap_offset,
+                        const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<double>& ap_buffer, const size_t ap_offset,
                         cl_uint num_queues, cl_command_queue *queues,
                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDspr(layout, triangle,
                     n,
                     alpha,
-                    x_buffer, x_offset, static_cast<int>(x_inc),
-                    ap_buffer, ap_offset,
+                    x_buffer(), x_offset, static_cast<int>(x_inc),
+                    ap_buffer(), ap_offset,
                     num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle,
+                        const size_t n,
+                        const half alpha,
+                        const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                        Buffer<half>& ap_buffer, const size_t ap_offset,
+                        cl_uint num_queues, cl_command_queue *queues,
+                        cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]);
+  auto status = clblasXspr(layout, triangle,
+                           n,
+                           HalfToFloat(alpha),
+                           x_buffer_bis, x_offset, x_inc,
+                           ap_buffer_bis, ap_offset,
+                           num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SSYR2/DSYR2
 clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const float alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSsyr2(layout, triangle,
                      n,
                      alpha,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     a_buffer, a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     a_buffer(), a_offset, a_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const double alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDsyr2(layout, triangle,
                      n,
                      alpha,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     a_buffer, a_offset, a_ld,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     a_buffer(), a_offset, a_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle,
+                         const size_t n,
+                         const half alpha,
+                         const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto status = clblasXsyr2(layout, triangle,
+                            n,
+                            HalfToFloat(alpha),
+                            x_buffer_bis, x_offset, x_inc,
+                            y_buffer_bis, y_offset, y_inc,
+                            a_buffer_bis, a_offset, a_ld,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SSPR2/DSPR2
 clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const float alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem ap_buffer, const size_t ap_offset,
+                         const Buffer<float>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<float>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<float>& ap_buffer, const size_t ap_offset,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSspr2(layout, triangle,
                      n,
                      alpha,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     ap_buffer, ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     ap_buffer(), ap_offset,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle,
                          const size_t n,
                          const double alpha,
-                         const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
-                         const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
-                         cl_mem ap_buffer, const size_t ap_offset,
+                         const Buffer<double>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<double>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<double>& ap_buffer, const size_t ap_offset,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDspr2(layout, triangle,
                      n,
                      alpha,
-                     x_buffer, x_offset, static_cast<int>(x_inc),
-                     y_buffer, y_offset, static_cast<int>(y_inc),
-                     ap_buffer, ap_offset,
+                     x_buffer(), x_offset, static_cast<int>(x_inc),
+                     y_buffer(), y_offset, static_cast<int>(y_inc),
+                     ap_buffer(), ap_offset,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle,
+                         const size_t n,
+                         const half alpha,
+                         const Buffer<half>& x_buffer, const size_t x_offset, const size_t x_inc,
+                         const Buffer<half>& y_buffer, const size_t y_offset, const size_t y_inc,
+                         Buffer<half>& ap_buffer, const size_t ap_offset,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]);
+  auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]);
+  auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]);
+  auto status = clblasXspr2(layout, triangle,
+                            n,
+                            HalfToFloat(alpha),
+                            x_buffer_bis, x_offset, x_inc,
+                            y_buffer_bis, y_offset, y_inc,
+                            ap_buffer_bis, ap_offset,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]);
+  return status;
+}
 
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
@@ -1861,185 +2249,231 @@ clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle,
 clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose,
                          const size_t m, const size_t n, const size_t k,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const float beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSgemm(layout, a_transpose, b_transpose,
                      m, n, k,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      beta,
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose,
                          const size_t m, const size_t n, const size_t k,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const double beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDgemm(layout, a_transpose, b_transpose,
                      m, n, k,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      beta,
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose,
                          const size_t m, const size_t n, const size_t k,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const float2 beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCgemm(layout, a_transpose, b_transpose,
                      m, n, k,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      cl_float2{{beta.real(), beta.imag()}},
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose,
                          const size_t m, const size_t n, const size_t k,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const double2 beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZgemm(layout, a_transpose, b_transpose,
                      m, n, k,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      cl_double2{{beta.real(), beta.imag()}},
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose,
+                         const size_t m, const size_t n, const size_t k,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
+                         const half beta,
+                         Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
+  auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]);
+  auto status = clblasXgemm(layout, a_transpose, b_transpose,
+                            m, n, k,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            b_buffer_bis, b_offset, b_ld,
+                            HalfToFloat(beta),
+                            c_buffer_bis, c_offset, c_ld,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM
 clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                          const size_t m, const size_t n,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const float beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSsymm(layout, side, triangle,
                      m, n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      beta,
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                          const size_t m, const size_t n,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const double beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDsymm(layout, side, triangle,
                      m, n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      beta,
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                          const size_t m, const size_t n,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const float2 beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCsymm(layout, side, triangle,
                      m, n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      cl_float2{{beta.real(), beta.imag()}},
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                          const size_t m, const size_t n,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const double2 beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZsymm(layout, side, triangle,
                      m, n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      cl_double2{{beta.real(), beta.imag()}},
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
+                         const size_t m, const size_t n,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
+                         const half beta,
+                         Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
+  auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]);
+  auto status = clblasXsymm(layout, side, triangle,
+                            m, n,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            b_buffer_bis, b_offset, b_ld,
+                            HalfToFloat(beta),
+                            c_buffer_bis, c_offset, c_ld,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for CHEMM/ZHEMM
 clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                          const size_t m, const size_t n,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const float2 beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasChemm(layout, side, triangle,
                      m, n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      cl_float2{{beta.real(), beta.imag()}},
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle,
                          const size_t m, const size_t n,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          const double2 beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZhemm(layout, side, triangle,
                      m, n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      cl_double2{{beta.real(), beta.imag()}},
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -2047,99 +2481,119 @@ clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const
 clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose,
                          const size_t n, const size_t k,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
                          const float beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSsyrk(layout, triangle, a_transpose,
                      n, k,
                      alpha,
-                     a_buffer, a_offset, a_ld,
+                     a_buffer(), a_offset, a_ld,
                      beta,
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose,
                          const size_t n, const size_t k,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
                          const double beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDsyrk(layout, triangle, a_transpose,
                      n, k,
                      alpha,
-                     a_buffer, a_offset, a_ld,
+                     a_buffer(), a_offset, a_ld,
                      beta,
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose,
                          const size_t n, const size_t k,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          const float2 beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCsyrk(layout, triangle, a_transpose,
                      n, k,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
+                     a_buffer(), a_offset, a_ld,
                      cl_float2{{beta.real(), beta.imag()}},
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose,
                          const size_t n, const size_t k,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          const double2 beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZsyrk(layout, triangle, a_transpose,
                      n, k,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
+                     a_buffer(), a_offset, a_ld,
                      cl_double2{{beta.real(), beta.imag()}},
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose,
+                         const size_t n, const size_t k,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         const half beta,
+                         Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]);
+  auto status = clblasXsyrk(layout, triangle, a_transpose,
+                            n, k,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            HalfToFloat(beta),
+                            c_buffer_bis, c_offset, c_ld,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for CHERK/ZHERK
 clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose,
                          const size_t n, const size_t k,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          const float beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCherk(layout, triangle, a_transpose,
                      n, k,
                      alpha,
-                     a_buffer, a_offset, a_ld,
+                     a_buffer(), a_offset, a_ld,
                      beta,
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose,
                          const size_t n, const size_t k,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
                          const double beta,
-                         cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                         Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZherk(layout, triangle, a_transpose,
                      n, k,
                      alpha,
-                     a_buffer, a_offset, a_ld,
+                     a_buffer(), a_offset, a_ld,
                      beta,
-                     c_buffer, c_offset, c_ld,
+                     c_buffer(), c_offset, c_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -2147,111 +2601,134 @@ clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, co
 clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose,
                           const size_t n, const size_t k,
                           const float alpha,
-                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                          const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld,
                           const float beta,
-                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                          Buffer<float>& c_buffer, const size_t c_offset, const size_t c_ld,
                           cl_uint num_queues, cl_command_queue *queues,
                           cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasSsyr2k(layout, triangle, ab_transpose,
                       n, k,
                       alpha,
-                      a_buffer, a_offset, a_ld,
-                      b_buffer, b_offset, b_ld,
+                      a_buffer(), a_offset, a_ld,
+                      b_buffer(), b_offset, b_ld,
                       beta,
-                      c_buffer, c_offset, c_ld,
+                      c_buffer(), c_offset, c_ld,
                       num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose,
                           const size_t n, const size_t k,
                           const double alpha,
-                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                          const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld,
                           const double beta,
-                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                          Buffer<double>& c_buffer, const size_t c_offset, const size_t c_ld,
                           cl_uint num_queues, cl_command_queue *queues,
                           cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDsyr2k(layout, triangle, ab_transpose,
                       n, k,
                       alpha,
-                      a_buffer, a_offset, a_ld,
-                      b_buffer, b_offset, b_ld,
+                      a_buffer(), a_offset, a_ld,
+                      b_buffer(), b_offset, b_ld,
                       beta,
-                      c_buffer, c_offset, c_ld,
+                      c_buffer(), c_offset, c_ld,
                       num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose,
                           const size_t n, const size_t k,
                           const float2 alpha,
-                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                          const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                           const float2 beta,
-                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                          Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
                           cl_uint num_queues, cl_command_queue *queues,
                           cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCsyr2k(layout, triangle, ab_transpose,
                       n, k,
                       cl_float2{{alpha.real(), alpha.imag()}},
-                      a_buffer, a_offset, a_ld,
-                      b_buffer, b_offset, b_ld,
+                      a_buffer(), a_offset, a_ld,
+                      b_buffer(), b_offset, b_ld,
                       cl_float2{{beta.real(), beta.imag()}},
-                      c_buffer, c_offset, c_ld,
+                      c_buffer(), c_offset, c_ld,
                       num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose,
                           const size_t n, const size_t k,
                           const double2 alpha,
-                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                          const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                           const double2 beta,
-                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                          Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
                           cl_uint num_queues, cl_command_queue *queues,
                           cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZsyr2k(layout, triangle, ab_transpose,
                       n, k,
                       cl_double2{{alpha.real(), alpha.imag()}},
-                      a_buffer, a_offset, a_ld,
-                      b_buffer, b_offset, b_ld,
+                      a_buffer(), a_offset, a_ld,
+                      b_buffer(), b_offset, b_ld,
                       cl_double2{{beta.real(), beta.imag()}},
-                      c_buffer, c_offset, c_ld,
+                      c_buffer(), c_offset, c_ld,
                       num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose,
+                          const size_t n, const size_t k,
+                          const half alpha,
+                          const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
+                          const half beta,
+                          Buffer<half>& c_buffer, const size_t c_offset, const size_t c_ld,
+                          cl_uint num_queues, cl_command_queue *queues,
+                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
+  auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]);
+  auto status = clblasXsyr2k(layout, triangle, ab_transpose,
+                             n, k,
+                             HalfToFloat(alpha),
+                             a_buffer_bis, a_offset, a_ld,
+                             b_buffer_bis, b_offset, b_ld,
+                             HalfToFloat(beta),
+                             c_buffer_bis, c_offset, c_ld,
+                             num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for CHER2K/ZHER2K
 clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose,
                           const size_t n, const size_t k,
                           const float2 alpha,
-                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                          const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                           const float beta,
-                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                          Buffer<float2>& c_buffer, const size_t c_offset, const size_t c_ld,
                           cl_uint num_queues, cl_command_queue *queues,
                           cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCher2k(layout, triangle, ab_transpose,
                       n, k,
                       cl_float2{{alpha.real(), alpha.imag()}},
-                      a_buffer, a_offset, a_ld,
-                      b_buffer, b_offset, b_ld,
+                      a_buffer(), a_offset, a_ld,
+                      b_buffer(), b_offset, b_ld,
                       beta,
-                      c_buffer, c_offset, c_ld,
+                      c_buffer(), c_offset, c_ld,
                       num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose,
                           const size_t n, const size_t k,
                           const double2 alpha,
-                          const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                          const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                          const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                           const double beta,
-                          cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                          Buffer<double2>& c_buffer, const size_t c_offset, const size_t c_ld,
                           cl_uint num_queues, cl_command_queue *queues,
                           cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZher2k(layout, triangle, ab_transpose,
                       n, k,
                       cl_double2{{alpha.real(), alpha.imag()}},
-                      a_buffer, a_offset, a_ld,
-                      b_buffer, b_offset, b_ld,
+                      a_buffer(), a_offset, a_ld,
+                      b_buffer(), b_offset, b_ld,
                       beta,
-                      c_buffer, c_offset, c_ld,
+                      c_buffer(), c_offset, c_ld,
                       num_queues, queues, num_wait_events, wait_events, events);
 }
 
@@ -2259,117 +2736,153 @@ clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, c
 clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t m, const size_t n,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasStrmm(layout, side, triangle, a_transpose, diagonal,
                      m, n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t m, const size_t n,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDtrmm(layout, side, triangle, a_transpose, diagonal,
                      m, n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t m, const size_t n,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCtrmm(layout, side, triangle, a_transpose, diagonal,
                      m, n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t m, const size_t n,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZtrmm(layout, side, triangle, a_transpose, diagonal,
                      m, n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
+                         const size_t m, const size_t n,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
+  auto status = clblasXtrmm(layout, side, triangle, a_transpose, diagonal,
+                            m, n,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            b_buffer_bis, b_offset, b_ld,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]);
+  return status;
+}
 
 // Forwards the clBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM
 clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t m, const size_t n,
                          const float alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<float>& b_buffer, const size_t b_offset, const size_t b_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasStrsm(layout, side, triangle, a_transpose, diagonal,
                      m, n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t m, const size_t n,
                          const double alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<double>& b_buffer, const size_t b_offset, const size_t b_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasDtrsm(layout, side, triangle, a_transpose, diagonal,
                      m, n,
                      alpha,
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t m, const size_t n,
                          const float2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<float2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<float2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasCtrsm(layout, side, triangle, a_transpose, diagonal,
                      m, n,
                      cl_float2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
 clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
                          const size_t m, const size_t n,
                          const double2 alpha,
-                         const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
-                         cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                         const Buffer<double2>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<double2>& b_buffer, const size_t b_offset, const size_t b_ld,
                          cl_uint num_queues, cl_command_queue *queues,
                          cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
   return clblasZtrsm(layout, side, triangle, a_transpose, diagonal,
                      m, n,
                      cl_double2{{alpha.real(), alpha.imag()}},
-                     a_buffer, a_offset, a_ld,
-                     b_buffer, b_offset, b_ld,
+                     a_buffer(), a_offset, a_ld,
+                     b_buffer(), b_offset, b_ld,
                      num_queues, queues, num_wait_events, wait_events, events);
 }
+clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal,
+                         const size_t m, const size_t n,
+                         const half alpha,
+                         const Buffer<half>& a_buffer, const size_t a_offset, const size_t a_ld,
+                         Buffer<half>& b_buffer, const size_t b_offset, const size_t b_ld,
+                         cl_uint num_queues, cl_command_queue *queues,
+                         cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) {
+  auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]);
+  auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]);
+  auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal,
+                            m, n,
+                            HalfToFloat(alpha),
+                            a_buffer_bis, a_offset, a_ld,
+                            b_buffer_bis, b_offset, b_ld,
+                            num_queues, queues, num_wait_events, wait_events, events);
+  FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]);
+  return status;
+}
 
 // =================================================================================================
 } // namespace clblast
author	Cedric Nugteren <web@cedricnugteren.nl>	2016-06-28 22:32:25 +0200
committer	GitHub <noreply@github.com>	2016-06-28 22:32:25 +0200
commit	7c13bacf129291e3e295ecb6e833788477085fa0 (patch)
tree	d114eeca418444d0b1c70cc9cce983de041235c9
parent	181eb20bbf15cf11baaf6112b6965050c49dd543 (diff)
parent	577f0ee1179014ece853af39d6f0ff0c87316eb3 (diff)