author    mcian <mcian86@gmail.com>    2017-07-31 14:02:12 +0200
committer GitHub <noreply@github.com>  2017-07-31 14:02:12 +0200
commit    dc499065424af5dbf29a85bc6cbf4db9cd565cfd (patch)
tree      3a7f9e86d30a1c700ae264caf46f0187c1e13177
parent    f2477f663672fd37301d6e2ce4646519f71d5cce (diff)
parent    1155c068e982b1af19230c4c2d2e6dcb1d495414 (diff)

Merge pull request #1 from CNugteren/master

fork up to date
-rw-r--r--  .appveyor.yml                                     |  4
-rw-r--r--  .travis.yml                                       | 27
-rw-r--r--  CHANGELOG                                         |  3
-rw-r--r--  CMakeLists.txt                                    | 10
-rw-r--r--  README.md                                         | 32
-rw-r--r--  src/database/kernel_selection.hpp                 |  5
-rw-r--r--  src/database/kernels/copy.hpp                     | 18
-rw-r--r--  src/database/kernels/pad.hpp                      | 22
-rw-r--r--  src/database/kernels/padtranspose.hpp             | 20
-rw-r--r--  src/database/kernels/transpose.hpp                | 20
-rw-r--r--  src/database/kernels/xaxpy.hpp                    | 22
-rw-r--r--  src/database/kernels/xdot.hpp                     | 20
-rw-r--r--  src/database/kernels/xgemm.hpp                    |  8
-rw-r--r--  src/database/kernels/xgemm_direct.hpp             |  8
-rw-r--r--  src/database/kernels/xgemv.hpp                    |  6
-rw-r--r--  src/database/kernels/xgemv_fast.hpp               |  6
-rw-r--r--  src/database/kernels/xgemv_fast_rot.hpp           |  6
-rw-r--r--  src/database/kernels/xger.hpp                     | 18
-rw-r--r--  src/kernels/common.opencl                         | 19
-rw-r--r--  src/kernels/level1/level1.opencl                  |  4
-rw-r--r--  src/kernels/level2/level2.opencl                  | 24
-rw-r--r--  src/kernels/level2/xgemv.opencl                   |  6
-rw-r--r--  src/kernels/level2/xgemv_fast.opencl              |  4
-rw-r--r--  src/kernels/level3/copy_pad.opencl                | 34
-rw-r--r--  src/kernels/level3/invert_diagonal_blocks.opencl  | 18
-rw-r--r--  src/kernels/level3/transpose_pad.opencl           | 38
-rw-r--r--  src/kernels/level3/xgemm_direct_part1.opencl      | 56
-rw-r--r--  src/kernels/level3/xgemm_direct_part2.opencl      | 40
-rw-r--r--  src/kernels/level3/xgemm_direct_part3.opencl      | 18
-rw-r--r--  src/kernels/level3/xgemm_part1.opencl             | 22
-rw-r--r--  src/kernels/level3/xgemm_part2.opencl             |  8
-rw-r--r--  src/kernels/level3/xgemm_part3.opencl             | 22
-rw-r--r--  src/routine.cpp                                   |  8
-rw-r--r--  src/routines/level3/xgemm.cpp                     |  8
-rw-r--r--  src/routines/levelx/xgemmbatched.cpp              |  4
-rw-r--r--  src/utilities/buffer_test.hpp                     |  8
-rw-r--r--  test/correctness/testblas.hpp                     |  6
-rw-r--r--  test/routines/levelx/xgemmbatched.hpp             |  9

38 files changed, 421 insertions(+), 190 deletions(-)
diff --git a/.appveyor.yml b/.appveyor.yml
index a7742fa5..adb1860b 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -58,8 +58,8 @@ build_script:
after_build:
- ps: pushd $env:CLBLAST_BUILD
- - 7z a CLBlast-Windows-x64.zip .\install_dir\*
- - ps: mv CLBlast-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
+ - 7z a CLBlast-1.0.0-Windows-x64.zip .\install_dir\*
+ - ps: mv CLBlast-1.0.0-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
artifacts:
- path: '*.zip'
diff --git a/.travis.yml b/.travis.yml
index f628bb94..abd39aac 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,10 @@ matrix:
env:
global:
+ - CLBLAST_VERSION=1.0.0
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
+ - CLBLAST_INSTALL=${TRAVIS_BUILD_DIR}/bin/CLBlast-${CLBLAST_VERSION}
+ - CLBLAST_TAR=CLBlast-${CLBLAST_VERSION}-${TRAVIS_OS_NAME}-x64.tar.gz
before_install:
- cmake --version;
@@ -29,12 +32,24 @@ before_install:
- ${CXX} --version;
before_script:
+ - mkdir -p ${CLBLAST_INSTALL}
- mkdir -p ${CLBLAST_ROOT}
- pushd ${CLBLAST_ROOT}
- - cmake -DTESTS=ON -DCLIENTS=ON -DSAMPLES=ON -DNETLIB=ON ${TRAVIS_BUILD_DIR}
+ - cmake -DTESTS=ON -DCLIENTS=ON -DSAMPLES=ON -DCMAKE_INSTALL_PREFIX=${CLBLAST_INSTALL} ${TRAVIS_BUILD_DIR}
script:
- make
+ - make install
+
+after_success:
+ - pushd ${TRAVIS_BUILD_DIR}/bin
+ - rm ${CLBLAST_INSTALL}/bin/clblast_client_*
+ - rm ${CLBLAST_INSTALL}/bin/clblast_test_*
+ - echo ${CLBLAST_TAR}
+ - tar -cvf ${CLBLAST_TAR} CLBlast-${CLBLAST_VERSION}
+ - cp ${CLBLAST_TAR} ${TRAVIS_BUILD_DIR}
+ - pushd ${TRAVIS_BUILD_DIR}
+ - ls -l
branches:
only:
@@ -42,3 +57,13 @@ branches:
notifications:
email: false
+
+deploy:
+ provider: releases
+ api_key:
+ secure: oBnP56zfFTiON0v6nm6qiRevtTsojqaxV2E/+ahUP4iyZxZgn1zf9reGNEbB/s6wfHCwXpXKlCk3A0cEQzbfoZeQy3oMzyWHV/xgu+etOENe3z18oVEiVBe/WAd1/hMVmQvX65kHR+q12rce6K6rDm1mEIJC/udf5Dbdl2alVWgiL20Hrj/PSQAYZZuTmZLuMm7OBc1G2xhRmRo5FYgI2u1ZALUHDRov/yLQkoKwxAlzBhURoNTHW2wTAr3Pq01Fk2kfQFRmg7YFieu3cit/JGNzaDdgmT0U5pLRzhuPiD3qziNnC3rG7tnYV0jHQOLKH+AJ0csbNncG47JrUQrKDJGUs0fLBxHG4ErEdVc/s+l/ZTGBT6kOEjk5GLQviNuAzP51em+TATR6YJ4JdgnZEU3iwbyeY/lLPPWhOVDfUgLNVKHX7Sijf83Wp+cqspAdIcnT5lWMXUe7jciKQLC0B+jD6IQ/hCqF0/yX/H8Sa8jA+qSIrXWt/qSy1viKaQ3Sf8+rXyxG6dqYc0jUweQ248FOgUCtzmaZP48SoMBATN7JPCLzhGnY8IiMErGzc6jsevmoqB0MRqZhc2qsLEfTclxsMmfx2yVKt93G+zRMtQuYmf36MvDNbPaH+/tzE8pWxufSY0672qhL0sfvNO+FuCJ8Bsk4UwKV3lTeGjCwN5o=
+ file: ${CLBLAST_TAR}
+ skip_cleanup: true
+ on:
+ repo: CNugteren/CLBlast
+ tags: true
diff --git a/CHANGELOG b/CHANGELOG
index e7fca58c..7e5ebe9c 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,9 +1,10 @@
-Development (next version)
+Version 1.0.0
- Fixed a bug in the TRSM routine for alpha != 1
- Fixed a bug in the cache related to multi-device contexts (thanks to 'kpot')
- Fixed a bug in the direct version of the GEMM kernel
- Fixed several warnings for MSVC and Clang
+- Added support for Mesa Clover and AMD's ROCm by making the inline keyword optional in kernels
- Performance reports are now external at https://cnugteren.github.io/clblast
- Greatly improved compilation time of database.cpp
- Various minor fixes and enhancements
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 595c6b89..1c7120f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,8 +17,8 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
# CMake project details
project("clblast" C CXX)
-set(clblast_VERSION_MAJOR 0)
-set(clblast_VERSION_MINOR 11)
+set(clblast_VERSION_MAJOR 1)
+set(clblast_VERSION_MINOR 0)
set(clblast_VERSION_PATCH 0)
# Options and their default values
@@ -101,8 +101,10 @@ else()
set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
set(FLAGS "${FLAGS} -Wno-exit-time-destructors -Wno-global-constructors -Wno-missing-noreturn")
set(FLAGS "${FLAGS} -Wno-deprecated-declarations")
- if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 3.9.0)
- set(FLAGS "${FLAGS} -Wno-undefined-var-template")
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 3.9.0) # clang 4.0 or higher
+ if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0) # but not for AppleClang
+ set(FLAGS "${FLAGS} -Wno-undefined-var-template")
+ endif()
endif()
endif()
endif()
diff --git a/README.md b/README.md
index 45b5ab56..2386cc88 100644
--- a/README.md
+++ b/README.md
@@ -2,14 +2,15 @@
CLBlast: The tuned OpenCL BLAS library
================
-| | master branch |
-|-----|-----|
-| Linux/OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) |
-| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) |
+| | Build status | Tests on Intel GPU | Tests on NVIDIA GPU | Tests on AMD GPU |
+|-----|-----|-----|-----|-----|
+| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-Intel/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-NVIDIA/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-NVIDIA/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-AMD/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Windows-AMD/) |
+| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-Intel/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-NVIDIA/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-NVIDIA/) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-AMD/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-Linux-AMD/) |
+| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-OSX-Intel/badge/icon)](http://ci.arrayfire.org/view/Other/job/other/job/CLBlast-OSX-Intel/) | N/A | N/A |
CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on various devices as well as the latest CLBlast news.
-This preview-version is not yet tuned for all OpenCL devices: __if out-of-the-box performance is poor, please run the tuners first__. See below for a list of already tuned devices and instructions on how to tune yourself and contribute to future releases of the CLBlast library.
+The library is not tuned for all possible OpenCL devices: __if out-of-the-box performance is poor, please run the tuners first__. See below for a list of already tuned devices and instructions on how to tune yourself and contribute to future releases of the CLBlast library.
Why CLBlast and not clBLAS or cuBLAS?
@@ -56,6 +57,7 @@ The pre-requisites for compilation of CLBlast are:
- AMD APP SDK
- Intel OpenCL
- Beignet
+ - Mesa Clover
An example of an out-of-source build using a command-line compiler and make (starting from the root of the CLBlast folder):
@@ -97,7 +99,7 @@ To get started quickly, a couple of stand-alone example programs are included in
cmake -DSAMPLES=ON ..
-There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severly. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
+There is also a Netlib CBLAS C API available. This is however not recommended if you need full control over performance, since every call copies all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions, performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables. This API can be used as follows after providing the `-DNETLIB=ON` flag to CMake:
#include <clblast_netlib_c.h>
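For context, a minimal host-side sketch of calling this Netlib API; the call shape follows standard CBLAS, but the exact enum spellings in `clblast_netlib_c.h` are an assumption here and should be checked against the header:

    // Sketch only (assumed enum names); build CLBlast with -DNETLIB=ON.
    #include <clblast_netlib_c.h>
    #include <vector>

    int main() {
      const int m = 64, n = 64, k = 64;
      std::vector<float> a(m * k, 1.0f), b(k * n, 1.0f), c(m * n, 0.0f);
      // Device/platform are chosen via the CLBLAST_DEVICE and CLBLAST_PLATFORM
      // environment variables; each call copies the buffers to and from the
      // OpenCL device, hence the performance caveat above.
      cblas_sgemm(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
                  m, n, k, 1.0f, a.data(), k, b.data(), n, 0.0f, c.data(), n);
      return 0;
    }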
@@ -154,6 +156,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
- Core i7-5930K
* Other devices:
- ARM Mali-T628 GPU
+ - Qualcomm Adreno 330 GPU
- Intel MIC
If your device is not (yet) among this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should compile the library with the optional tuners by specifying `-DTUNERS=ON`, for example as follows:
@@ -279,7 +282,7 @@ CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-
| xTRMM | ✔ | ✔ | ✔ | ✔ | ✔ |
| xTRSM | ✔ | ✔ | ✔ | ✔ | | (experimental, un-optimized)
-Futhermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance:
+Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance:
| Batched | S | D | C | Z | H |
| -------------|---|---|---|---|---|
@@ -302,7 +305,7 @@ Some less commonly used BLAS routines are not yet supported by CLBlast. They
Half precision (fp16)
-------------
-The half-precison fp16 format is a 16-bits floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this are among others Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is in particular interest for the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.
+The half-precision fp16 format is a 16-bit floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor of 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this are among others Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is of particular interest to the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.
Since there is no half-precision data-type in C or C++, OpenCL provides the `cl_half` type for the host device. Unfortunately, internally this translates to a 16-bits integer, so computations on the host using this data-type should be avoided. For convenience, CLBlast provides the `clblast_half.h` header (C99 and C++ compatible), defining the `half` type as a short-hand to `cl_half` and the following basic functions:
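A minimal sketch of the host-side round-trip this describes, assuming the FloatToHalf/HalfToFloat conversion helpers from `clblast_half.h`:

    // Store data as fp16 on the host, but compute via float: 'half' is only a
    // 16-bit storage type, so host arithmetic on it should be avoided.
    #include <clblast_half.h>
    #include <vector>

    int main() {
      std::vector<half> x(1024);
      for (size_t i = 0; i < x.size(); ++i) {
        x[i] = FloatToHalf(0.5f);             // convert once, store compactly
      }
      const float first = HalfToFloat(x[0]);  // convert back before host math
      return (first == 0.5f) ? 0 : 1;         // 0.5 is exact in fp16
    }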
@@ -317,7 +320,7 @@ Contributing
Contributions are welcome in the form of tuning results for OpenCL devices previously untested or pull requests. See [the contributing guidelines](CONTRIBUTING.md) for more details.
-The contributing authors (code, pull requests, testing) so far are:
+The main contributing authors (code, pull requests, testing) are:
* [Cedric Nugteren](http://cnugteren.github.io) - main author
* [Anton Lokhmotov](https://github.com/psyhtest)
@@ -328,6 +331,8 @@ The contributing authors (code, pull requests, testing) so far are:
* [Ivan Shapovalov](https://github.com/intelfx)
* [Dimitri Van Assche](https://github.com/dvasschemacq)
* [Shehzan Mohammed](https://shehzan10.github.io)
+* [Marco Cianfriglia](https://github.com/mcian)
+* Everyone else listed as a [GitHub contributor](https://github.com/CNugteren/CLBlast/graphs/contributors)
Tuning and testing on a variety of OpenCL devices was made possible by:
@@ -336,6 +341,13 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
* [dividiti](http://www.dividiti.com)
* [SURFsara HPC center](http://www.surfsara.com)
* [ArrayFire](http://arrayfire.org)
+* Everyone reporting [tuning results](https://github.com/CNugteren/CLBlast/issues/1)
+
+Hardware/software for this project was contributed by:
+
+* [ArrayFire](http://arrayfire.org) for setting up and supporting Jenkins CI correctness tests on 7 platforms
+* [JetBrains](https://www.jetbrains.com/clion/) for supplying a free CLion IDE license for CLBlast developers
+* [Travis CI](https://travis-ci.org/CNugteren/CLBlast/branches) and [AppVeyor](https://ci.appveyor.com/project/CNugteren/clblast) for free automated build tests for open-source projects
More information
@@ -343,7 +355,7 @@ More information
Further information on CLBlast is available through the following links:
-* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slideset is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
+* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017). For CLTune, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
index 44ef0d32..82c7d59d 100644
--- a/src/database/kernel_selection.hpp
+++ b/src/database/kernel_selection.hpp
@@ -52,6 +52,11 @@ const Database::DatabaseEntry KernelSelectionSingle = {
{ "default", { 1280*1280*1280 } },
}
},
+ {
+ kDeviceTypeGPU, "ARM", {
+ { "default", { 128*128*128} },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 512*512*512 } },
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index 259f95c3..e5defb32 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -30,6 +30,12 @@ const Database::DatabaseEntry CopyHalf = {
{ "default", { 8, 32, 4, 8 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 8, 1 } },
+ { "default", { 32, 8, 8, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 16, 8, 4, 4 } },
@@ -113,6 +119,12 @@ const Database::DatabaseEntry CopySingle = {
{ "default", { 8, 32, 4, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 8, 1 } },
+ { "default", { 32, 8, 8, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 32, 8, 4, 4 } },
@@ -187,6 +199,12 @@ const Database::DatabaseEntry CopyComplexSingle = {
{ "default", { 32, 8, 1, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 1, 1 } },
+ { "default", { 32, 8, 1, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 16, 8, 1, 2 } },
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index f925d07d..b6ebde43 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -30,9 +30,15 @@ const Database::DatabaseEntry PadHalf = {
{ "default", { 8, 8, 2, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 16, 8, 4, 2 } },
+ { "default", { 16, 8, 4, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 8, 8, 2, 1 } },
+ { "default", { 8, 8, 4, 1 } },
}
},
}
@@ -113,6 +119,12 @@ const Database::DatabaseEntry PadSingle = {
{ "default", { 32, 8, 4, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 2, 1 } },
+ { "default", { 32, 8, 2, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 32, 8, 2, 1 } },
@@ -195,9 +207,15 @@ const Database::DatabaseEntry PadComplexSingle = {
{ "default", { 32, 8, 1, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 8, 4, 1 } },
+ { "default", { 32, 8, 4, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 32, 8, 1, 2 } },
+ { "default", { 32, 8, 1, 1 } },
}
},
}
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index b80a1666..bbda5c65 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -30,6 +30,12 @@ const Database::DatabaseEntry PadtransposeHalf = {
{ "default", { 0, 8, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 0, 8, 8 } },
+ { "default", { 0, 8, 8 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 0, 8, 1 } },
@@ -112,6 +118,12 @@ const Database::DatabaseEntry PadtransposeSingle = {
{ "default", { 1, 32, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 0, 8, 2 } },
+ { "default", { 0, 8, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 1, 16, 2 } },
@@ -194,9 +206,15 @@ const Database::DatabaseEntry PadtransposeComplexSingle = {
{ "default", { 1, 16, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 0, 8, 4 } },
+ { "default", { 0, 8, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 1, 16, 2 } },
+ { "default", { 1, 8, 2 } },
}
},
}
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 446b632c..b00a23dc 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -30,9 +30,15 @@ const Database::DatabaseEntry TransposeHalf = {
{ "default", { 8, 1, 0, 8 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 8, 0, 0, 4 } },
+ { "default", { 8, 0, 0, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 16, 0, 1, 4 } },
+ { "default", { 8, 0, 1, 8 } },
}
},
}
@@ -113,6 +119,12 @@ const Database::DatabaseEntry TransposeSingle = {
{ "default", { 8, 1, 0, 4 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 8, 1, 1, 4 } },
+ { "default", { 8, 1, 1, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 8, 0, 1, 4 } },
@@ -189,6 +201,12 @@ const Database::DatabaseEntry TransposeComplexSingle = {
{ "default", { 16, 1, 0, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 16, 1, 0, 1 } },
+ { "default", { 16, 1, 0, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 8, 1, 1, 2 } },
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 58cde9d3..5cb225d1 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -30,9 +30,15 @@ const Database::DatabaseEntry XaxpyHalf = {
{ "default", { 8, 64, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 8, 64, 1 } },
+ { "default", { 8, 64, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 8, 256, 4 } },
+ { "default", { 8, 64, 1 } },
}
},
}
@@ -113,9 +119,15 @@ const Database::DatabaseEntry XaxpySingle = {
{ "default", { 4, 1024, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 4, 128, 2 } },
+ { "default", { 4, 128, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 4, 256, 1 } },
+ { "default", { 4, 64, 1 } },
}
},
}
@@ -195,6 +207,12 @@ const Database::DatabaseEntry XaxpyComplexSingle = {
{ "default", { 1, 256, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 1, 64, 1 } },
+ { "default", { 1, 64, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 1, 128, 1 } },
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index d234c558..986c32b2 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -30,9 +30,15 @@ const Database::DatabaseEntry XdotHalf = {
{ "default", { 128, 32 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 64 } },
+ { "default", { 64, 64 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 128, 32 } },
+ { "default", { 128, 64 } },
}
},
}
@@ -95,6 +101,12 @@ const Database::DatabaseEntry XdotSingle = {
{ "default", { 256, 64 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 128, 64 } },
+ { "default", { 128, 64 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 128, 32 } },
@@ -159,6 +171,12 @@ const Database::DatabaseEntry XdotComplexSingle = {
{ "default", { 512, 64 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 256 } },
+ { "default", { 64, 256 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 256, 32 } },
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index 2270dd44..43854afb 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -112,9 +112,15 @@ const Database::DatabaseEntry XgemmSingle = {
{ "default", { 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 1 } },
+ { "default", { 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
+ { "default", { 32, 2, 8, 8, 32, 8, 8, 32, 1, 1, 0, 0, 4, 2 } },
}
},
}
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
index 7a1cd983..acace63f 100644
--- a/src/database/kernels/xgemm_direct.hpp
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -77,9 +77,15 @@ const Database::DatabaseEntry XgemmDirectSingle = {
{ "default", { 2, 8, 8, 16, 16, 1, 1, 4, 2, 32 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 2, 8, 8, 8, 8, 1, 1, 2, 1, 16 } },
+ { "default", { 2, 8, 8, 8, 8, 1, 1, 2, 1, 16 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
- { "default", { 2, 8, 8, 8, 8, 1, 1, 4, 2, 32 } },
+ { "default", { 2, 8, 8, 8, 8, 1, 1, 1, 2, 16 } },
}
},
}
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index 7adb6f10..c537294a 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -106,6 +106,12 @@ const Database::DatabaseEntry XgemvSingle = {
{ "default", { 256, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 1 } },
+ { "default", { 64, 1 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 128, 1 } },
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 8c42aa0e..c3b9103a 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -106,6 +106,12 @@ const Database::DatabaseEntry XgemvFastSingle = {
{ "default", { 1, 256, 1 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 1, 64, 4 } },
+ { "default", { 1, 64, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 1, 64, 1 } },
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index 644498e2..7e5905e4 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -82,6 +82,12 @@ const Database::DatabaseEntry XgemvFastRotSingle = {
{ "default", { 8, 32, 32 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 4, 64, 16 } },
+ { "default", { 4, 64, 16 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 8, 32, 32 } },
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index d294ab43..e17396f6 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -30,6 +30,12 @@ const Database::DatabaseEntry XgerHalf = {
{ "default", { 4, 8, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 4, 2 } },
+ { "default", { 64, 4, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 64, 1, 2 } },
@@ -101,6 +107,12 @@ const Database::DatabaseEntry XgerSingle = {
{ "default", { 128, 1, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 128, 1, 2 } },
+ { "default", { 128, 1, 2 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 32, 4, 2 } },
@@ -171,6 +183,12 @@ const Database::DatabaseEntry XgerComplexSingle = {
{ "default", { 128, 2, 2 } },
}
},
+ { // QUALCOMM GPUs
+ kDeviceTypeGPU, "QUALCOMM", {
+ { "QUALCOMM Adreno(TM)", { 64, 1, 4 } },
+ { "default", { 64, 1, 4 } },
+ }
+ },
{ // Default
kDeviceTypeAll, "default", {
{ "default", { 64, 2, 2 } },
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index db4c8ec4..9481881e 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -235,6 +235,15 @@ R"(
// =================================================================================================
+// Force inlining functions or not: some compilers don't support the inline keyword
+#ifdef USE_INLINE_KEYWORD
+ #define INLINE_FUNC inline
+#else
+ #define INLINE_FUNC
+#endif
+
+// =================================================================================================
+
// Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is
// enabled (see src/routine.cpp).
#ifndef USE_STAGGERED_INDICES
@@ -245,18 +254,18 @@ R"(
// http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
// More details: https://github.com/CNugteren/CLBlast/issues/53
#if USE_STAGGERED_INDICES == 1
- inline size_t GetGroupIDFlat() {
+ INLINE_FUNC size_t GetGroupIDFlat() {
return get_group_id(0) + get_num_groups(0) * get_group_id(1);
}
- inline size_t GetGroupID1() {
+ INLINE_FUNC size_t GetGroupID1() {
return (GetGroupIDFlat()) % get_num_groups(1);
}
- inline size_t GetGroupID0() {
+ INLINE_FUNC size_t GetGroupID0() {
return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
}
#else
- inline size_t GetGroupID1() { return get_group_id(1); }
- inline size_t GetGroupID0() { return get_group_id(0); }
+ INLINE_FUNC size_t GetGroupID1() { return get_group_id(1); }
+ INLINE_FUNC size_t GetGroupID0() { return get_group_id(0); }
#endif
// =================================================================================================
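The kernel-side INLINE_FUNC macro above is driven by a host-side compile definition; a sketch of how the build options might be assembled (hypothetical helper, not the actual src/routine.cpp logic):

    // Hypothetical helper: pass -DUSE_INLINE_KEYWORD only when the device's
    // OpenCL compiler accepts 'inline'; otherwise INLINE_FUNC expands to
    // nothing, which is what Mesa Clover and ROCm need.
    #include <string>

    std::string KernelBuildOptions(const bool compiler_supports_inline) {
      std::string options;
      if (compiler_supports_inline) {
        options += " -DUSE_INLINE_KEYWORD";
      }
      return options;  // appended to the clBuildProgram options string
    }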
diff --git a/src/kernels/level1/level1.opencl b/src/kernels/level1/level1.opencl
index 7e10426b..3c60c54a 100644
--- a/src/kernels/level1/level1.opencl
+++ b/src/kernels/level1/level1.opencl
@@ -47,7 +47,7 @@ R"(
// =================================================================================================
// The vectorized multiply function
-inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) {
+INLINE_FUNC realV MultiplyVector(realV cvec, const real aval, const realV bvec) {
#if VW == 1
Multiply(cvec, aval, bvec);
#elif VW == 2
@@ -89,7 +89,7 @@ inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) {
}
// The vectorized multiply-add function
-inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
+INLINE_FUNC realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
#if VW == 1
MultiplyAdd(cvec, aval, bvec);
#elif VW == 2
diff --git a/src/kernels/level2/level2.opencl b/src/kernels/level2/level2.opencl
index be979766..505231ca 100644
--- a/src/kernels/level2/level2.opencl
+++ b/src/kernels/level2/level2.opencl
@@ -33,9 +33,9 @@ R"(
// =================================================================================================
// Returns an element from a vector
-inline real LoadVector(const int id, const int max,
- __global real* gm, const int offset, const int inc,
- const int do_conjugate) {
+INLINE_FUNC real LoadVector(const int id, const int max,
+ __global real* gm, const int offset, const int inc,
+ const int do_conjugate) {
if (id < max) {
real result = gm[id*inc + offset];
if (do_conjugate) {
@@ -53,10 +53,10 @@ inline real LoadVector(const int id, const int max,
}
// Performs the rank-1 matrix update
-inline void MatrixUpdate(const int id1, const int id2, const int max1, const int max2,
- __global real* agm, const int a_offset, const int a_ld,
- const real alpha, const real xvalue, const real yvalue,
- const int is_upper) {
+INLINE_FUNC void MatrixUpdate(const int id1, const int id2, const int max1, const int max2,
+ __global real* agm, const int a_offset, const int a_ld,
+ const real alpha, const real xvalue, const real yvalue,
+ const int is_upper) {
// Bounds of a regular matrix
if (id1 < max1 && id2 < max2) {
@@ -100,11 +100,11 @@ inline void MatrixUpdate(const int id1, const int id2, const int max1, const int
}
// Performs the rank-2 matrix update
-inline void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2,
- __global real* agm, const int a_offset, const int a_ld,
- const real alpha1, const real xvalue, const real yvalue,
- const real alpha2, const real xtvalue, const real ytvalue,
- const int is_upper) {
+INLINE_FUNC void MatrixUpdate2(const int id1, const int id2, const int max1, const int max2,
+ __global real* agm, const int a_offset, const int a_ld,
+ const real alpha1, const real xvalue, const real yvalue,
+ const real alpha2, const real xtvalue, const real ytvalue,
+ const int is_upper) {
// Bounds of a regular matrix
if (id1 < max1 && id2 < max2) {
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index ff011acd..ea0478f0 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -36,9 +36,9 @@ R"(
// =================================================================================================
// Defines how to load the input matrix in the non-vectorized case
-inline real LoadMatrixA(const __global real* restrict agm, const int x, const int y,
- const int a_ld, const int a_offset, const int parameter,
- const int kl, const int ku) {
+INLINE_FUNC real LoadMatrixA(const __global real* restrict agm, const int x, const int y,
+ const int a_ld, const int a_offset, const int parameter,
+ const int kl, const int ku) {
real result;
// For banded matrices
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 02a1f956..8a08f076 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -75,8 +75,8 @@ R"(
// =================================================================================================
// Loads a vector input value
-inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
- const int a_ld) {
+INLINE_FUNC realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
+ const int a_ld) {
return agm[a_ld*y + x];
}
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index 93b89187..6eeadbd1 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -24,14 +24,14 @@ R"(
// Copies a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
// value and offset can be different.
-inline void _CopyPadMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real alpha,
- const int do_conjugate) {
+INLINE_FUNC void _CopyPadMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real alpha,
+ const int do_conjugate) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -79,15 +79,15 @@ void CopyPadMatrix(const int src_one, const int src_two,
// Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
// writes only the actual data back to the destination matrix. Again, the ld value and offset can
// be different.
-inline void _CopyMatrix(const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
+INLINE_FUNC void _CopyMatrix(const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
// Loops over the work per thread in both dimensions
#pragma unroll
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index 874c1510..93241700 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -164,10 +164,10 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src
// =================================================================================================
// Triple matrix-multiplication kernel: C = A * B
-inline void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
- __global const real* agm, __global const real* bgm, __global real* cgm,
- const int lda, const int ldb, const int ldc,
- int current_size, int num_pages, const int block_size) {
+INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n,
+ __global const real* agm, __global const real* bgm, __global real* cgm,
+ const int lda, const int ldb, const int ldc,
+ int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int by = get_group_id(1) / num_pages;
@@ -250,9 +250,9 @@ inline void TripleMatMul(const int size, const bool upper, const int part, __loc
// =================================================================================================
// Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower)
-inline void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
- __global const real* src, const int a_offset, const int lda,
- __global real* dest, int current_size, int num_pages, const int block_size) {
+INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n,
+ __global const real* src, const int a_offset, const int lda,
+ __global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
@@ -286,8 +286,8 @@ inline void TripleMatMulPart1(const int size, const bool upper, __local real* bl
}
// Triple matrix-multiplication kernel part 2: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower)
-inline void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
- __global real* dest, int current_size, int num_pages, const int block_size) {
+INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n,
+ __global real* dest, int current_size, int num_pages, const int block_size) {
// Emulates a 3D grid: NX * (NY * num_pages)
const int page = get_group_id(1) % num_pages;
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index fb60ce75..49c5b9a3 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -24,15 +24,15 @@ R"(
// Transposes a matrix from source to destination. The output is padded with zero values in case the
// destination matrix dimensions are larger than the transposed source matrix dimensions.
-inline void _TransposePadMatrix(__local real* tile,
- const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real alpha,
- const int do_conjugate) {
+INLINE_FUNC void _TransposePadMatrix(__local real* tile,
+ const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real alpha,
+ const int do_conjugate) {
// Loop over the work per thread
#pragma unroll
@@ -105,16 +105,16 @@ void TransposePadMatrix(const int src_one, const int src_two,
// Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
// padded source matrix, but only the actual data is written back to the transposed destination
// matrix. This kernel optionally checks for upper/lower triangular matrices.
-inline void _TransposeMatrix(__local real* tile,
- const int src_one, const int src_two,
- const int src_ld, const int src_offset,
- __global const real* restrict src,
- const int dest_one, const int dest_two,
- const int dest_ld, const int dest_offset,
- __global real* dest,
- const real alpha,
- const int upper, const int lower,
- const int diagonal_imag_zero) {
+INLINE_FUNC void _TransposeMatrix(__local real* tile,
+ const int src_one, const int src_two,
+ const int src_ld, const int src_offset,
+ __global const real* restrict src,
+ const int dest_one, const int dest_two,
+ const int dest_ld, const int dest_offset,
+ __global real* dest,
+ const real alpha,
+ const int upper, const int lower,
+ const int diagonal_imag_zero) {
// Loop over the work per thread
#pragma unroll
diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl
index a8bd450e..8b650589 100644
--- a/src/kernels/level3/xgemm_direct_part1.opencl
+++ b/src/kernels/level3/xgemm_direct_part1.opencl
@@ -93,7 +93,7 @@ R"(
// =================================================================================================
// Initializes the accumulation registers to zero
-inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
+INLINE_FUNC void InitAccRegistersDirect(real cpm[NWID][MWID]) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
#pragma unroll
@@ -106,7 +106,7 @@ inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
// =================================================================================================
// Performs the actual computation: Cpm += Apm * Bpm
-inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
+INLINE_FUNC void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
#pragma unroll
@@ -120,9 +120,9 @@ inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real
// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix.
-inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
- const int a_ld, const int a_offset, const int idm, const int idk,
- const int a_transpose, const int a_conjugate) {
+INLINE_FUNC void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
+ const int a_ld, const int a_offset, const int idm, const int idk,
+ const int a_transpose, const int a_conjugate) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
@@ -132,9 +132,9 @@ inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[
}
// Same as above, but now for the B input matrix
-inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
- const int b_ld, const int b_offset, const int idn, const int idk,
- const int b_transpose, const int b_conjugate) {
+INLINE_FUNC void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
+ const int b_ld, const int b_offset, const int idn, const int idk,
+ const int b_transpose, const int b_conjugate) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
@@ -145,10 +145,10 @@ inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[
// Loads global off-chip memory into thread-private register files. This function is specific for
// loading the A input matrix. This is the same as above but now includes a bounds check.
-inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
- const int a_ld, const int a_offset, const int idm, const int idk,
- const int a_transpose, const int a_conjugate,
- const int kSizeM) {
+INLINE_FUNC void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
+ const int a_ld, const int a_offset, const int idm, const int idk,
+ const int a_transpose, const int a_conjugate,
+ const int kSizeM) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
if (idm + mi < kSizeM) {
@@ -163,10 +163,10 @@ inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm
}
// Same as above, but now for the B input matrix
-inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
- const int b_ld, const int b_offset, const int idn, const int idk,
- const int b_transpose, const int b_conjugate,
- const int kSizeN) {
+INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
+ const int b_ld, const int b_offset, const int idn, const int idk,
+ const int b_transpose, const int b_conjugate,
+ const int kSizeN) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
if (idn + ni < kSizeN) {
@@ -184,8 +184,8 @@ inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
-inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
- const int a_transpose) {
+INLINE_FUNC void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
+ const int a_transpose) {
#pragma unroll
for (int mi=0; mi<MWID; ++mi) {
const int mg = mi + get_local_id(0)*MWID;
@@ -195,8 +195,8 @@ inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int k
}
// Same as above, but now for the B input matrix
-inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
- const int b_transpose) {
+INLINE_FUNC void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
+ const int b_transpose) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
const int ng = ni + get_local_id(1)*NWID;
@@ -209,10 +209,10 @@ inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int k
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
- const int idm, const int idn,
- const real alpha, const real beta,
- const int c_ld, const int c_offset, const int c_transpose) {
+INLINE_FUNC void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
+ const int idm, const int idn,
+ const real alpha, const real beta,
+ const int c_ld, const int c_offset, const int c_transpose) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
#pragma unroll
@@ -237,10 +237,10 @@ inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
- const int idm, const int idn, const int kSizeM, const int kSizeN,
- const real alpha, const real beta,
- const int c_ld, const int c_offset, const int c_transpose) {
+INLINE_FUNC void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
+ const int idm, const int idn, const int kSizeM, const int kSizeN,
+ const real alpha, const real beta,
+ const int c_ld, const int c_offset, const int c_transpose) {
#pragma unroll
for (int ni=0; ni<NWID; ++ni) {
#pragma unroll
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
index 3af14bff..1d9330fc 100644
--- a/src/kernels/level3/xgemm_direct_part2.opencl
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -19,9 +19,9 @@ R"(
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
-inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
- const int a_ld, const int a_offset, const int kwg,
- const int a_transpose, const int a_conjugate) {
+INLINE_FUNC void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate) {
#if MDIMCD == MDIMAD
const int la0 = get_local_id(0);
const int la1 = get_local_id(1);
@@ -90,9 +90,9 @@ inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local re
}
// Same as above, but now for the B input matrix
-inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
- const int b_ld, const int b_offset, const int kwg,
- const int b_transpose, const int b_conjugate) {
+INLINE_FUNC void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate) {
#if MDIMCD == NDIMBD
const int lb0 = get_local_id(0);
const int lb1 = get_local_id(1);
@@ -165,9 +165,9 @@ inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local re
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix. In contrast to the functions above, this function doesn't
// use the vector data-types.
-inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
- const int a_ld, const int a_offset, const int kwg,
- const int a_transpose, const int a_conjugate) {
+INLINE_FUNC void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate) {
#if MDIMCD == MDIMAD
const int la0 = get_local_id(0);
const int la1 = get_local_id(1);
@@ -196,9 +196,9 @@ inline void GlobalToLocalScalarA(const __global real* restrict agms, __local rea
}
// Same as above, but now for the B input matrix
-inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
- const int b_ld, const int b_offset, const int kwg,
- const int b_transpose, const int b_conjugate) {
+INLINE_FUNC void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate) {
#if MDIMCD == NDIMBD
const int lb0 = get_local_id(0);
const int lb1 = get_local_id(1);
@@ -231,10 +231,10 @@ inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local rea
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix. In contrast to the functions above, this function performs bounds
// checks and doesn't use the vector data-types.
-inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
- const int a_ld, const int a_offset, const int kwg,
- const int a_transpose, const int a_conjugate,
- const int kSizeM, const int kSizeK) {
+INLINE_FUNC void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+ const int a_ld, const int a_offset, const int kwg,
+ const int a_transpose, const int a_conjugate,
+ const int kSizeM, const int kSizeK) {
#if MDIMCD == MDIMAD
const int la0 = get_local_id(0);
const int la1 = get_local_id(1);
@@ -270,10 +270,10 @@ inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local re
}
// Same as above, but now for the B input matrix
-inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
- const int b_ld, const int b_offset, const int kwg,
- const int b_transpose, const int b_conjugate,
- const int kSizeN, const int kSizeK) {
+INLINE_FUNC void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+ const int b_ld, const int b_offset, const int kwg,
+ const int b_transpose, const int b_conjugate,
+ const int kSizeN, const int kSizeK) {
#if MDIMCD == NDIMBD
const int lb0 = get_local_id(0);
const int lb1 = get_local_id(1);
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
index c04cdeb8..b0beb614 100644
--- a/src/kernels/level3/xgemm_direct_part3.opencl
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -18,15 +18,15 @@ R"(
// =================================================================================================
// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
-inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
- const real_arg arg_alpha,
- const real_arg arg_beta,
- const __global realMD* restrict agm, const int a_offset, const int a_ld,
- const __global realND* restrict bgm, const int b_offset, const int b_ld,
- __global real* cgm, const int c_offset, const int c_ld,
- __local real* alm, __local real* blm,
- const int a_transpose, const int b_transpose, const int c_transpose,
- const int a_conjugate, const int b_conjugate) {
+INLINE_FUNC void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
+ const real_arg arg_alpha,
+ const real_arg arg_beta,
+ const __global realMD* restrict agm, const int a_offset, const int a_ld,
+ const __global realND* restrict bgm, const int b_offset, const int b_ld,
+ __global real* cgm, const int c_offset, const int c_ld,
+ __local real* alm, __local real* blm,
+ const int a_transpose, const int b_transpose, const int c_transpose,
+ const int a_conjugate, const int b_conjugate) {
const real alpha = GetRealArg(arg_alpha);
const real beta = GetRealArg(arg_beta);
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index d0ce06ad..07dafe13 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -135,7 +135,7 @@ R"(
// =================================================================================================
// Initializes the accumulation registers to zero
-inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
+INLINE_FUNC void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#pragma unroll
@@ -186,8 +186,8 @@ inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) {
// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
// caching the A input matrix.
#if SA == 1
-inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
- const int kSizeM, const int tid, const int kwg) {
+INLINE_FUNC void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm,
+ const int kSizeM, const int tid, const int kwg) {
const int la0 = tid % MDIMA;
const int la1 = tid / MDIMA;
#pragma unroll
@@ -216,8 +216,8 @@ inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* al
// Same as above, but now for the B input matrix
#if SB == 1
-inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
- const int kSizeN, const int tid, const int kwg) {
+INLINE_FUNC void GlobalToLocalB(const __global realN* restrict bgm, __local realN* blm,
+ const int kSizeN, const int tid, const int kwg) {
const int lb0 = tid % NDIMB;
const int lb1 = tid / NDIMB;
#pragma unroll
@@ -249,8 +249,8 @@ inline void GlobalToLocalB(const __global realN* restrict bgm, __local realN* bl
// Caches global off-chip memory directly into per-thread private memory (registers). This function
// is specific for caching the A input matrix.
#if SA == 0
-inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
- const int kSizeM, const int idk, const int kwg) {
+INLINE_FUNC void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/VWM],
+ const int kSizeM, const int idk, const int kwg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
@@ -272,8 +272,8 @@ inline void GlobalToPrivateA(const __global realM* restrict agm, realM apm[MWI/V
// Same as above, but now for the B input matrix
#if SB == 0
-inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
- const int kSizeN, const int idk) {
+INLINE_FUNC void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/VWN],
+ const int kSizeN, const int idk) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
@@ -298,7 +298,7 @@ inline void GlobalToPrivateB(const __global realN* restrict bgm, realN bpm[NWI/V
// Caches on-chip local memory into per-thread private memory (registers). This function is specific
// for caching the A input matrix.
#if SA == 1
-inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
+INLINE_FUNC void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg) {
#pragma unroll
for (int mi=0; mi<MWI/VWM; ++mi) {
#if STRM == 0
@@ -313,7 +313,7 @@ inline void LocalToPrivateA(__local realM* alm, realM apm[MWI/VWM], const int kg
// Same as above, but now for the B input matrix
#if SB == 1
-inline void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
+INLINE_FUNC void LocalToPrivateB(__local realN* blm, realN bpm[NWI/VWN], const int kg) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
#if STRN == 0
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index e8234a29..06fafc8f 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -18,7 +18,7 @@ R"(
// =================================================================================================
// The vectorised multiply-add function
-inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
+INLINE_FUNC realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
#if USE_VECTOR_MAD == 1
cvec += avec * bval;
#else
@@ -64,7 +64,7 @@ inline realM MultiplyAddVector(realM cvec, const realM avec, const real bval) {
}
// Performs the actual computation: Cpm += Apm * Bpm
-inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
+INLINE_FUNC void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], realN bpm[NWI/VWN]) {
#pragma unroll
for (int ni=0; ni<NWI/VWN; ++ni) {
#pragma unroll
@@ -115,8 +115,8 @@ inline void MultiplyAccumulate(realM cpm[NWI][MWI/VWM], realM apm[MWI/VWM], real
// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
-inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
- const real alpha, const real beta) {
+INLINE_FUNC void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int kSizeM,
+ const real alpha, const real beta) {
#pragma unroll
for (int ni=0; ni<NWI; ++ni) {
#pragma unroll
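
In the MultiplyAddVector hunk above, the USE_VECTOR_MAD == 1 path relies on the compiler to fuse the vector expression into hardware multiply-adds; the elided #else branch instead updates each lane through a scalar helper. A plain-C++ stand-in of that idea (illustrative only; 'real2' and 'MultiplyAdd' here are stand-ins, not the kernel's actual types):

    // Per-lane fallback mirroring what MultiplyAddVector does when
    // USE_VECTOR_MAD is disabled (shown for a two-wide vector):
    struct real2 { float x, y; };

    inline void MultiplyAdd(float &c, const float a, const float b) { c += a * b; }

    inline real2 MultiplyAddVector(real2 cvec, const real2 avec, const float bval) {
      MultiplyAdd(cvec.x, avec.x, bval);
      MultiplyAdd(cvec.y, avec.y, bval);
      return cvec;
    }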
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 8ac3a3a8..3f0d590d 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -18,17 +18,17 @@ R"(
// =================================================================================================
// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
- const __global realM* restrict agm, const __global realN* restrict bgm,
- __global realM* cgm, realM cpm[NWI][MWI/VWM]
- #if SA == 1 && SB == 1
- , __local realM* alm, __local realN* blm
- #elif SA == 1
- , __local realM* alm
- #elif SB == 1
- , __local realN* blm
- #endif
- ) {
+INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+ const __global realM* restrict agm, const __global realN* restrict bgm,
+ __global realM* cgm, realM cpm[NWI][MWI/VWM]
+ #if SA == 1 && SB == 1
+ , __local realM* alm, __local realN* blm
+ #elif SA == 1
+ , __local realM* alm
+ #elif SB == 1
+ , __local realN* blm
+ #endif
+ ) {
// Allocates workitem-private memory (registers)
realM apm[MWI/VWM];
diff --git a/src/routine.cpp b/src/routine.cpp
index 81baa590..7d4ed76f 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -135,7 +135,13 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
// Adds the name of the routine as a define
source_string += "#define ROUTINE_"+routine_name_+"\n";
- // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
+ // Not all OpenCL compilers support the 'inline' keyword. It is therefore only enabled
+ // for devices on which it is known to work across all OpenCL platforms.
+ if (device_.IsNVIDIA() || device_.IsARM()) {
+ source_string += "#define USE_INLINE_KEYWORD 1\n";
+ }
+
+ // For specific devices, use the non-IEEE754 compliant OpenCL mad() instruction. This can improve
// performance, but might result in a reduced accuracy.
if (device_.IsAMD() && device_.IsGPU()) {
source_string += "#define USE_CL_MAD 1\n";
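
The INLINE_FUNC qualifier introduced throughout the kernel hunks above pairs with this new USE_INLINE_KEYWORD define. Its definition lives in src/kernels/common.opencl (also touched by this merge, but not shown in this section); a minimal sketch, assuming the conventional two-way macro:

    // Hypothetical reconstruction of the INLINE_FUNC definition: when the host
    // injects USE_INLINE_KEYWORD, kernel helpers are marked 'inline'; otherwise
    // the qualifier is dropped for OpenCL compilers that reject it.
    #ifdef USE_INLINE_KEYWORD
      #define INLINE_FUNC inline
    #else
      #define INLINE_FUNC
    #endif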
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 30e5999c..f4611aba 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -280,11 +280,11 @@ void Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
kernel.SetArgument(16, static_cast<int>(b_conjugate));
// Computes the global and local thread sizes
- const auto m_ceiled = Ceil(m, db_["WGD"]);
- const auto n_ceiled = Ceil(n, db_["WGD"]);
+ //const auto m_ceiled = Ceil(m, db_["WGD"]);
+ //const auto n_ceiled = Ceil(n, db_["WGD"]);
const auto global = std::vector<size_t>{
- (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
- (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+ CeilDiv(m * db_["MDIMCD"], db_["WGD"]),
+ CeilDiv(n * db_["NDIMCD"], db_["WGD"])
};
const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
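
The rewritten global-size computation leans on two integer rounding helpers. A self-contained sketch with plausible definitions (the actual ones live in CLBlast's utilities; treat these as assumptions) plus a worked example:

    #include <cstddef>

    // Plausible definitions of the helpers used above (a sketch, not verbatim source):
    size_t CeilDiv(const size_t x, const size_t y) { return 1 + ((x - 1) / y); }  // ceil(x / y)
    size_t Ceil(const size_t x, const size_t y) { return CeilDiv(x, y) * y; }     // round x up to a multiple of y

    // Example with m = 100, MDIMCD = 8, WGD = 32:
    //   before: (Ceil(100, 32) * 8) / 32 = (128 * 8) / 32 = 32
    //   after:  CeilDiv(100 * 8, 32)     = CeilDiv(800, 32) = 25
    // The new form no longer rounds m up to a full WGD tile before scaling.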
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 0fea1922..ee8448d2 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -94,8 +94,8 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans
// Tests the matrices for validity
for (auto batch = size_t{0}; batch < batch_count; ++batch) {
- TestMatrixA(a_one, a_two, a_buffer, a_offsets[batch], a_ld);
- TestMatrixB(b_one, b_two, b_buffer, b_offsets[batch], b_ld);
+ TestMatrixA(a_one, a_two, a_buffer, a_offsets[batch], a_ld, false); // don't test for invalid LD
+ TestMatrixB(b_one, b_two, b_buffer, b_offsets[batch], b_ld, false); // don't test for invalid LD
TestMatrixC(c_one, c_two, c_buffer, c_offsets[batch], c_ld);
}
diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp
index 652ab8c6..b5693181 100644
--- a/src/utilities/buffer_test.hpp
+++ b/src/utilities/buffer_test.hpp
@@ -23,8 +23,8 @@ namespace clblast {
// Tests matrix 'A' for validity
template <typename T>
void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
- if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); }
+ const size_t offset, const size_t ld, const bool test_lead_dim = true) {
+ if (test_lead_dim && ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryA); }
@@ -34,8 +34,8 @@ void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
// Tests matrix 'B' for validity
template <typename T>
void TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
- if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); }
+ const size_t offset, const size_t ld, const bool test_lead_dim = true) {
+ if (test_lead_dim && ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
if (buffer.GetSize() < required_size) { throw BLASError(StatusCode::kInsufficientMemoryB); }
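
The remaining size check still guards the buffer itself. As a standalone illustration of the formula (the function name is hypothetical, not CLBlast API):

    #include <cstddef>

    // The last column of a 'one' x 'two' column-major matrix starts at ld*(two-1),
    // so the buffer must hold at least ld*(two-1) + one elements past the offset.
    template <typename T>
    size_t RequiredMatrixBytes(const size_t one, const size_t two,
                               const size_t offset, const size_t ld) {
      return (ld * (two - 1) + one + offset) * sizeof(T);
    }

    // Example: one = 100, two = 50, ld = 128, offset = 0, T = float:
    //   (128 * 49 + 100 + 0) * 4 = 6372 * 4 = 25488 bytes required.

Because test_lead_dim defaults to true, existing call sites compile unchanged; only the batched GEMM path above opts out of the leading-dimension check while keeping this size check.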
diff --git a/test/correctness/testblas.hpp b/test/correctness/testblas.hpp
index a553687d..b377be47 100644
--- a/test/correctness/testblas.hpp
+++ b/test/correctness/testblas.hpp
@@ -343,6 +343,12 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
auto i_args = args;
i_args.m = i_args.n = i_args.k = i_args.kl = i_args.ku = tester.kBufferSize;
i_args.a_ld = i_args.b_ld = i_args.c_ld = tester.kBufferSize;
+ i_args.batch_count = 3;
+ i_args.alphas = std::vector<U>(i_args.batch_count);
+ i_args.betas = std::vector<U>(i_args.batch_count);
+ i_args.a_offsets = std::vector<size_t>(i_args.batch_count);
+ i_args.b_offsets = std::vector<size_t>(i_args.batch_count);
+ i_args.c_offsets = std::vector<size_t>(i_args.batch_count);
for (auto &x_size: x_sizes) { i_args.x_size = x_size;
for (auto &y_size: y_sizes) { i_args.y_size = y_size;
for (auto &a_size: a_sizes) { i_args.a_size = a_size;
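
Pre-sizing the per-batch vectors matters because batched routines read one entry per batch: with batch_count now set to 3 for the invalid-buffer tests, undersized vectors would be indexed out of bounds before the routine ever saw the bad buffer. A minimal illustration (hypothetical names, not the test-harness API):

    #include <cstddef>
    #include <vector>

    struct BatchedArgs {
      size_t batch_count;
      std::vector<float> alphas, betas;
      std::vector<size_t> a_offsets, b_offsets, c_offsets;
    };

    // Each batch dereferences its own scalar and offsets, so every vector must
    // hold at least batch_count entries even when their values are irrelevant.
    void TouchAllBatches(const BatchedArgs &args) {
      for (size_t batch = 0; batch < args.batch_count; ++batch) {
        (void)args.alphas[batch];
        (void)args.a_offsets[batch];
      }
    }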
diff --git a/test/routines/levelx/xgemmbatched.hpp b/test/routines/levelx/xgemmbatched.hpp
index 56823e47..704d0578 100644
--- a/test/routines/levelx/xgemmbatched.hpp
+++ b/test/routines/levelx/xgemmbatched.hpp
@@ -110,6 +110,15 @@ class TestXgemmBatched {
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
auto queue_plain = queue();
auto event = cl_event{};
+ // The library relaxes the requirement on ld_a and ld_b; this check is added here to match clBLAS
+ auto a_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+ (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+ auto b_rotated = (args.layout == Layout::kColMajor && args.b_transpose != Transpose::kNo) ||
+ (args.layout == Layout::kRowMajor && args.b_transpose == Transpose::kNo);
+ auto a_one = (!a_rotated) ? args.m : args.k;
+ auto b_one = (!b_rotated) ? args.k : args.n;
+ if (args.a_ld < a_one) { return StatusCode::kInvalidLeadDimA; }
+ if (args.b_ld < b_one) { return StatusCode::kInvalidLeadDimB; }
auto status = GemmBatched(args.layout, args.a_transpose, args.b_transpose,
args.m, args.n, args.k, args.alphas.data(),
buffers.a_mat(), args.a_offsets.data(), args.a_ld,