From 56483347e8b70c672f9173ef7e7c38b9e2bc10bf Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Tue, 28 Jun 2016 22:33:13 +0200
Subject: Prepared the changelog for the next release

---
 CHANGELOG | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index b49424c9..56e9e90b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,7 @@
+Development version (next release)
+-
+
 Version 0.8.0
 - Added support for half-precision floating-point (fp16) in the library
 - Made it possible to compile the performance tests (clients) separately from the correctness tests
--
cgit v1.2.3

From cd74aaac5290d14ba03ad13bb2fffa1040ccff5d Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Wed, 29 Jun 2016 19:42:49 +0200
Subject: Updated to version 6.0 of the CLCudaAPI header

---
 CHANGELOG | 2 +-
 src/clpp11.hpp | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 56e9e90b..3c258bdf 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,6 @@
 Development version (next release)
--
+- Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
 
 Version 0.8.0
 - Added support for half-precision floating-point (fp16) in the library
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index b834d8b4..f8bc2b02 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -163,6 +163,15 @@ class Device {
 
   // Methods to retrieve device information
   std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
+  size_t VersionNumber() const
+  {
+    std::string version_string = Version().substr(7);
+    // Space separates the end of the OpenCL version number from the beginning of the
+    // vendor-specific information.
+    size_t next_whitespace = version_string.find(' ');
+    size_t version = (size_t) (100.0 * std::stod(version_string.substr(0, next_whitespace)));
+    return version;
+  }
   std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
   std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
   std::string Type() const {
@@ -211,6 +220,8 @@ class Device {
   bool IsCPU() const { return Type() == "CPU"; }
   bool IsGPU() const { return Type() == "GPU"; }
   bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+  bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
+  bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
   bool IsARM() const { return Vendor() == "ARM"; }
 
   // Accessor to the private data-member
@@ -386,8 +397,16 @@ class Queue {
                                              delete s; }) {
     auto status = CL_SUCCESS;
     #ifdef CL_VERSION_2_0
-      cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
-      *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
+      size_t ocl_version = device.VersionNumber();
+      if (ocl_version >= 200)
+      {
+        cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+        *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
+      }
+      else
+      {
+        *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+      }
     #else
       *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
     #endif
--
cgit v1.2.3

From b330ab086640382157688ea6b9633b5f0a22dac3 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Thu, 30 Jun 2016 10:49:17 +0200
Subject: Added declspec(dllexport) to ClearCache and FillCache, and added declspec(dllimport) when not building the library

---
 CHANGELOG | 1 +
 CMakeLists.txt | 7
++++++- include/clblast.h | 16 ++++++++++++++-- include/clblast_c.h | 6 +++++- scripts/generator/generator.py | 2 +- src/clblast.cpp | 1 - src/public_api.hpp | 34 ---------------------------------- 7 files changed, 27 insertions(+), 40 deletions(-) delete mode 100644 src/public_api.hpp diff --git a/CHANGELOG b/CHANGELOG index 3c258bdf..fe8f7221 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ Development version (next release) - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header +- Fixed proper MSVC dllimport and dllexport declarations Version 0.8.0 - Added support for half-precision floating-point (fp16) in the library diff --git a/CMakeLists.txt b/CMakeLists.txt index 6deee35d..70e4198c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ # # ================================================================================================== -cmake_minimum_required(VERSION 2.8.10) +cmake_minimum_required(VERSION 2.8.11) # Overrides for MSVC static runtime set(CMAKE_USER_MAKE_RULES_OVERRIDE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/c_flag_overrides.cmake) @@ -166,6 +166,11 @@ endforeach() add_library(clblast SHARED ${SOURCES}) target_link_libraries(clblast ${OPENCL_LIBRARIES}) +# Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built +if(MSVC) + target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11 +endif() + # Installs the library install(TARGETS clblast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) diff --git a/include/clblast.h b/include/clblast.h index c8596b39..e1d4f25b 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -25,6 +25,18 @@ #include #endif +// Exports library functions under Windows when building a DLL. See also: +// https://msdn.microsoft.com/en-us/library/a90k134d.aspx +#ifdef _WIN32 + #ifdef COMPILING_DLL + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif +#else + #define PUBLIC_API +#endif + namespace clblast { // ================================================================================================= @@ -576,11 +588,11 @@ StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. -StatusCode ClearCache(); +StatusCode PUBLIC_API ClearCache(); // The cache can also be pre-initialized for a specific device with all possible CLBLast kernels. // Further CLBlast routine calls will then run at maximum speed. -StatusCode FillCache(const cl_device_id device); +StatusCode PUBLIC_API FillCache(const cl_device_id device); // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index b92febac..a13b8e64 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -25,7 +25,11 @@ // Exports library functions under Windows when building a DLL. 
See also: // https://msdn.microsoft.com/en-us/library/a90k134d.aspx #ifdef _WIN32 - #define PUBLIC_API __declspec(dllexport) + #ifdef COMPILING_DLL + #define PUBLIC_API __declspec(dllexport) + #else + #define PUBLIC_API __declspec(dllimport) + #endif #else #define PUBLIC_API #endif diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index cf01f79e..6aa6fc18 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -385,7 +385,7 @@ files = [ path_clblast+"/test/wrapper_clblas.hpp", path_clblast+"/test/wrapper_cblas.hpp", ] -header_lines = [84, 74, 93, 22, 29, 41] +header_lines = [96, 73, 97, 22, 29, 41] footer_lines = [17, 75, 19, 14, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/src/clblast.cpp b/src/clblast.cpp index 88d60772..79c30ca4 100644 --- a/src/clblast.cpp +++ b/src/clblast.cpp @@ -16,7 +16,6 @@ #include #include "clblast.h" -#include "public_api.hpp" #include "cache.hpp" // BLAS level-1 includes diff --git a/src/public_api.hpp b/src/public_api.hpp deleted file mode 100644 index d0732297..00000000 --- a/src/public_api.hpp +++ /dev/null @@ -1,34 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file provides macro's to define the public API. This is needed when building a Windows DLL. -// Note: this is only used for the C++ interface, the C interface has its own definition included in -// the header file itself. -// -// ================================================================================================= - -#ifndef CLBLAST_PUBLIC_API_H_ -#define CLBLAST_PUBLIC_API_H_ - -namespace clblast { -// ================================================================================================= - -// Exports library functions under Windows when building a DLL. 
See also: -// https://msdn.microsoft.com/en-us/library/a90k134d.aspx -#ifdef _WIN32 - #define PUBLIC_API __declspec(dllexport) -#else - #define PUBLIC_API -#endif - -// ================================================================================================= -} // namespace clblast - -// CLBLAST_PUBLIC_API_H_ -#endif -- cgit v1.2.3 From 7cf2f8c26882aee4cd3e95fe22967f04318b6bf7 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 2 Jul 2016 15:34:55 +0200 Subject: Fixed some memory leaks related to events not properly cleaned-up --- CHANGELOG | 1 + samples/cache.c | 1 + samples/dgemv.c | 1 + samples/haxpy.c | 1 + samples/sasum.c | 1 + samples/sgemm.c | 1 + samples/sgemm.cpp | 1 + src/clpp11.hpp | 33 +++++++++++++++++++++------------ 8 files changed, 28 insertions(+), 12 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index fe8f7221..725eb116 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ Development version (next release) - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header - Fixed proper MSVC dllimport and dllexport declarations +- Fixed memory leaks related to events not being released Version 0.8.0 - Added support for half-precision floating-point (fp16) in the library diff --git a/samples/cache.c b/samples/cache.c index 7f876be1..a592824d 100644 --- a/samples/cache.c +++ b/samples/cache.c @@ -113,6 +113,7 @@ void run_example_routine(const cl_device_id device) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Retrieves the execution time clock_t diff = clock() - start; diff --git a/samples/dgemv.c b/samples/dgemv.c index 6ea0deb0..c22c9f37 100644 --- a/samples/dgemv.c +++ b/samples/dgemv.c @@ -85,6 +85,7 @@ int main(void) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Example completed. See "clblast_c.h" for status codes (0 -> success). printf("Completed DGEMV with status %d\n", status); diff --git a/samples/haxpy.c b/samples/haxpy.c index 3c7bb33a..d5b98e12 100644 --- a/samples/haxpy.c +++ b/samples/haxpy.c @@ -78,6 +78,7 @@ int main(void) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Copies the result back to the host clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL); diff --git a/samples/sasum.c b/samples/sasum.c index 3fdbb0eb..1518cc13 100644 --- a/samples/sasum.c +++ b/samples/sasum.c @@ -74,6 +74,7 @@ int main(void) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Copies the result back to the host clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL); diff --git a/samples/sgemm.c b/samples/sgemm.c index 79f30c83..b4827777 100644 --- a/samples/sgemm.c +++ b/samples/sgemm.c @@ -88,6 +88,7 @@ int main(void) { // Wait for completion clWaitForEvents(1, &event); + clReleaseEvent(event); // Example completed. See "clblast_c.h" for status codes (0 -> success). 
printf("Completed SGEMM with status %d\n", status); diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp index 5fe7490a..a4b89968 100644 --- a/samples/sgemm.cpp +++ b/samples/sgemm.cpp @@ -96,6 +96,7 @@ int main() { // Record the execution time clWaitForEvents(1, &event); + clReleaseEvent(event); auto elapsed_time = std::chrono::steady_clock::now() - start_time; auto time_ms = std::chrono::duration(elapsed_time).count(); diff --git a/src/clpp11.hpp b/src/clpp11.hpp index f8bc2b02..1eeaf702 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -72,15 +72,24 @@ inline void CheckError(const cl_int status) { class Event { public: - // Constructor based on the regular OpenCL data-type - explicit Event(const cl_event event): event_(event) { } + // Constructor based on the regular OpenCL data-type: memory management is handled elsewhere + explicit Event(const cl_event event): + event_(new cl_event) { + *event_ = event; + } - // Regular constructor - explicit Event(): event_(nullptr) { } + // Regular constructor with memory management + explicit Event(): + event_(new cl_event, [](cl_event* e) { + if (*e) { CheckError(clReleaseEvent(*e)); } + delete e; + }) { + *event_ = nullptr; + } // Waits for completion of this event void WaitForCompletion() const { - CheckError(clWaitForEvents(1, &event_)); + CheckError(clWaitForEvents(1, &(*event_))); } // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on @@ -89,20 +98,20 @@ class Event { float GetElapsedTime() const { WaitForCompletion(); auto bytes = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes); auto time_start = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes); auto time_end = size_t{0}; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); + clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr); return (time_end - time_start) * 1.0e-6f; } // Accessor to the private data-member - cl_event& operator()() { return event_; } - cl_event* pointer() { return &event_; } + cl_event& operator()() { return *event_; } + cl_event* pointer() { return &(*event_); } private: - cl_event event_; + std::shared_ptr event_; }; // Pointer to an OpenCL event -- cgit v1.2.3 From 5a690f4e36ea50d5401ba37d013ef425a98f2542 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 2 Jul 2016 16:44:13 +0200 Subject: Prints the current pandas version and reports the minimum required version --- scripts/database/database.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/database/database.py b/scripts/database/database.py index 49bc1801..a91fcffa 100644 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -22,6 +22,7 @@ except ImportError: # Additional modules import pandas as pd +print("## Using pandas version "+pd.__version__+", requires at least 0.18.0") # Server storing a copy of the database DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db" -- cgit v1.2.3 From 7424532859ff91321da56a72ae1e9195a059a351 Mon Sep 17 00:00:00 2001 From: Gian-Carlo Pascutto Date: Thu, 30 Jun 2016 23:57:11 +0200 Subject: 
Ensure clGetKernelWorkGroupInfo return value fits. In LocalMemUsage(), there's a first call to clGetKernelWorkGroupInfo to get the "bytes" amount needed to store the result from CL_KERNEL_LOCAL_MEM_SIZE. However, the actual value passed is an "auto result = size_t", which in 32-bit mode is 4 bytes, regardless of the previous return value. The spec describes that it will actually be a cl_ulong which is 8 bytes. To prevent stack corruption, make sure we are in fact passing a cl_ulong. Also adjust all callers to take the changed type into account. --- src/clpp11.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 1eeaf702..2b21e1e1 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -199,8 +199,8 @@ class Device { std::vector MaxWorkItemSizes() const { return GetInfoVector(CL_DEVICE_MAX_WORK_ITEM_SIZES); } - size_t LocalMemSize() const { - return static_cast(GetInfo(CL_DEVICE_LOCAL_MEM_SIZE)); + cl_ulong LocalMemSize() const { + return GetInfo(CL_DEVICE_LOCAL_MEM_SIZE); } std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); } @@ -211,7 +211,7 @@ class Device { size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL // Configuration-validity checks - bool IsLocalMemoryValid(const size_t local_mem_usage) const { + bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const { return (local_mem_usage <= LocalMemSize()); } bool IsThreadConfigValid(const std::vector &local) const { @@ -655,11 +655,11 @@ class Kernel { } // Retrieves the amount of local memory used per work-group for this kernel - size_t LocalMemUsage(const Device &device) const { + cl_ulong LocalMemUsage(const Device &device) const { auto bytes = size_t{0}; auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE}; CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes)); - auto result = size_t{0}; + auto result = cl_ulong{0}; CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr)); return result; } -- cgit v1.2.3 From 9683b50c557c7e415389b321d437be39ce0e519d Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 3 Jul 2016 20:30:47 +0200 Subject: Added tuning results for GTX670, GTX750, and GTX1070 (thanks to gcp) --- CHANGELOG | 2 ++ README.md | 4 ++++ scripts/database/database.py | 2 +- src/database/kernels/copy.hpp | 12 ++++++++++++ src/database/kernels/pad.hpp | 14 +++++++++++++- src/database/kernels/padtranspose.hpp | 12 ++++++++++++ src/database/kernels/transpose.hpp | 16 ++++++++++++++-- src/database/kernels/xaxpy.hpp | 12 ++++++++++++ src/database/kernels/xdot.hpp | 14 +++++++++++++- src/database/kernels/xgemm.hpp | 14 +++++++++++++- src/database/kernels/xgemv.hpp | 10 ++++++++++ src/database/kernels/xger.hpp | 16 ++++++++++++++-- 12 files changed, 120 insertions(+), 8 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 725eb116..b0d70c80 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,8 @@ Development version (next release) - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header - Fixed proper MSVC dllimport and dllexport declarations - Fixed memory leaks related to events not being released +- Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems +- Added tuned parameters for various devices (see README) Version 0.8.0 - Added support for half-precision floating-point (fp16) in the library diff --git a/README.md b/README.md index ddd841e2..7fca2a35 
100644 --- a/README.md +++ b/README.md @@ -99,9 +99,12 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC * NVIDIA GPUs: - GRID K520 - GeForce GTX 480 + - GeForce GTX 670 - GeForce GTX 680 + - GeForce GTX 750 - GeForce GTX 750 Ti - GeForce GTX 980 + - GeForce GTX 1070 - GeForce GTX Titan - GeForce GTX Titan X - Tesla K20m @@ -278,6 +281,7 @@ The contributing authors (code, pull requests, testing) so far are: * [Dragan Djuric](https://github.com/blueberry) * [Marco Hutter](https://github.com/gpus) * [Hugh Perkins](https://github.com/hughperkins) +* [Gian-Carlo Pascutto](https://github.com/gcp) Tuning and testing on a variety of OpenCL devices was made possible by: diff --git a/scripts/database/database.py b/scripts/database/database.py index a91fcffa..a70b9fc1 100644 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -22,7 +22,7 @@ except ImportError: # Additional modules import pandas as pd -print("## Using pandas version "+pd.__version__+", requires at least 0.18.0") +print("## Using pandas version "+pd.__version__+", requires at least 0.17.0") # Server storing a copy of the database DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db" diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index 14946af4..a76a08e7 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -75,8 +75,11 @@ const Database::DatabaseEntry Database::CopySingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, + { "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, @@ -133,7 +136,10 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 1070", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 750", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, @@ -186,8 +192,11 @@ const Database::DatabaseEntry Database::CopyDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } 
}, { "GeForce GTX 480", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 670", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX 750", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 750 Ti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } }, @@ -241,8 +250,11 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 1070", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "GeForce GTX 480", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 670", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 680", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "GeForce GTX 750", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index db4df9f0..6bb0bc86 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -75,8 +75,11 @@ const Database::DatabaseEntry Database::PadSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1070", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, @@ -139,15 +142,18 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "GeForce GTX 680", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 750 Ti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 
980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default @@ -194,8 +200,11 @@ const Database::DatabaseEntry Database::PadDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 670", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "GeForce GTX 750", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 750 Ti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -249,8 +258,11 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 1070", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "GeForce GTX 480", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 670", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 680", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "GeForce GTX 750", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 750 Ti", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX 980", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "GeForce GTX TITAN", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index 7fedd15a..b117f9fe 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -75,8 +75,11 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 1070", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { 
"GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -139,8 +142,11 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -194,8 +200,11 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, { "GeForce GTX 980", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, @@ -249,8 +258,11 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 1070", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 480", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 670", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 680", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, + { "GeForce GTX 750", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX 750 Ti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "GeForce GTX 980", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "GeForce GTX TITAN", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index 4229e39f..8faa53f4 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -75,15 +75,18 @@ const Database::DatabaseEntry Database::TransposeSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 750", { 
{"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, { // Default @@ -133,8 +136,11 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 1070", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 480", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, @@ -188,8 +194,11 @@ const Database::DatabaseEntry Database::TransposeDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 750 Ti", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 980", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "GeForce GTX TITAN", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, @@ -237,15 +246,18 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 1070", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 480", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "GeForce GTX 670", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "GeForce GTX 680", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "GeForce GTX 750", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 750 Ti", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX 980", { {"TRA_DIM",32}, 
{"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, { // Default diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index d8088ca2..b6ea1d7e 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -75,8 +75,11 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, @@ -139,8 +142,11 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "GeForce GTX 750", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, @@ -194,8 +200,11 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, @@ -249,8 +258,11 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "GeForce GTX 670", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 750", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX TITAN", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, diff --git a/src/database/kernels/xdot.hpp 
b/src/database/kernels/xdot.hpp index 48288f95..8670c50b 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -60,8 +60,11 @@ const Database::DatabaseEntry Database::XdotSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",1024} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",1024} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",128} } }, + { "GeForce GTX 750", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, @@ -106,8 +109,11 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, @@ -144,12 +150,15 @@ const Database::DatabaseEntry Database::XdotDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",32} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",512} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 670", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",64} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",256} } }, { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // Default @@ -182,8 +191,11 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",32} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WGS2",64} } }, { "GeForce GTX 480", { {"WGS1",512}, {"WGS2",32} } }, + { "GeForce GTX 670", { {"WGS1",512}, {"WGS2",128} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WGS2",64} } }, + { "GeForce GTX 750", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 27cebc8a..f2cd46d1 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -69,15 +69,18 @@ const Database::DatabaseEntry Database::XgemmSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, 
{"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "GeForce GTX 750", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } }, { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } }, { "Tesla K20m", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Tesla K40m", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Default @@ -133,8 +136,11 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 670", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 750", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, 
{"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -188,8 +194,11 @@ const Database::DatabaseEntry Database::XgemmDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, + { "GeForce GTX 1070", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "GeForce GTX 680", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, + { "GeForce GTX 750", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "GeForce GTX 980", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "GeForce GTX TITAN", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, @@ -243,8 +252,11 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 1070", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 480", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 670", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } }, { "GeForce GTX 680", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "GeForce GTX 750", { {"KWG",32}, 
{"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 750 Ti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "GeForce GTX 980", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "GeForce GTX TITAN X", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index ce258f2f..0f66e497 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -68,8 +68,11 @@ const Database::DatabaseEntry Database::XgemvSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, + { "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, @@ -125,8 +128,11 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, @@ -169,8 +175,11 @@ const Database::DatabaseEntry Database::XgemvDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, 
{"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, @@ -218,6 +227,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, } }, diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 3727cc57..f02482a0 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -67,10 +67,13 @@ const Database::DatabaseEntry Database::XgerSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, + { "GeForce GTX 1070", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, + { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, } }, { // Default @@ -118,8 +121,11 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 1070", { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } }, { "GeForce GTX 480", { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } }, + { "GeForce GTX 670", { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } }, } @@ -161,10 +167,13 @@ const Database::DatabaseEntry Database::XgerDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } }, + { "GeForce GTX 1070", { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 670", { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, + { "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, 
{"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } }, + { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } }, } }, { // Default @@ -204,8 +213,11 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, + { "GeForce GTX 1070", { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, + { "GeForce GTX 670", { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } }, { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, + { "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } }, } -- cgit v1.2.3 From 2d665099ef6b14713beb0cac1bd405073a49e791 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Mon, 4 Jul 2016 19:46:14 +0200 Subject: Fixed a linking issue with the tuners on Visual Studio --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 70e4198c..adb87658 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,9 +221,15 @@ if(TUNERS) # Includes CLTune include_directories(${CLTUNE_INCLUDE_DIRS}) + # Visual Studio requires the sources of non-exported objects/libraries + set(TUNERS_COMMON ) + if(MSVC) + set(TUNERS_COMMON ${TUNERS_COMMON} src/utilities.cpp) + endif() + # Adds tuning executables foreach(KERNEL ${KERNELS}) - add_executable(clblast_tuner_${KERNEL} src/tuning/kernels/${KERNEL}.cpp) + add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp) target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() -- cgit v1.2.3 From 77325b8974e19188fc5afad1447d4df4f9ae30fd Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 6 Jul 2016 21:25:55 +0200 Subject: Added an option to the performance clients to do a warm-up run before timing --- CHANGELOG | 1 + src/utilities.hpp | 3 ++- test/performance/client.cpp | 15 ++++++++++++++- test/performance/client.hpp | 3 +++ 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index b0d70c80..15948bbd 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ Development version (next release) - Fixed proper MSVC dllimport and dllexport declarations - Fixed memory leaks related to events not being released - Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems +- Added an option (-warm_up) to do a warm-up run before timing in the performance clients - Added tuned parameters for various devices (see README) Version 0.8.0 diff --git a/src/utilities.hpp b/src/utilities.hpp index 5a4eef0f..d5efab9f 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -80,8 +80,9 @@ constexpr auto kArgComparecblas = "cblas"; constexpr auto kArgStepSize = "step"; constexpr auto kArgNumSteps = "num_steps"; constexpr auto kArgNumRuns = "runs"; +constexpr auto kArgWarmUp = "warm_up"; -// The client-specific arguments in string form +// The test-specific arguments in string form constexpr auto kArgFullTest = "full_test"; constexpr auto kArgVerbose = "verbose"; diff --git a/test/performance/client.cpp b/test/performance/client.cpp index d0068f8b..aaaab22e 100644 --- a/test/performance/client.cpp +++ b/test/performance/client.cpp @@ -113,6 +113,7 @@ Arguments 
Client::ParseArguments(int argc, char *argv[], const size_t le args.print_help = CheckArgument(argc, argv, help, kArgHelp); args.silent = CheckArgument(argc, argv, help, kArgQuiet); args.no_abbrv = CheckArgument(argc, argv, help, kArgNoAbbreviations); + warm_up_ = CheckArgument(argc, argv, help, kArgWarmUp); // Prints the chosen (or defaulted) arguments to screen. This also serves as the help message, // which is thus always displayed (unless silence is specified). @@ -244,12 +245,24 @@ template double Client::TimedExecution(const size_t num_runs, const Arguments &args, Buffers &buffers, Queue &queue, Routine run_blas, const std::string &library_name) { + auto status = StatusCode::kSuccess; + + // Do an optional warm-up to omit compilation times and initialisations from the measurements + if (warm_up_) { + try { + status = run_blas(args, buffers, queue); + } catch (...) { status = static_cast(kUnknownError); } + if (status != StatusCode::kSuccess) { + throw std::runtime_error(library_name+" error: "+ToString(static_cast(status))); + } + } + + // Start the timed part auto timings = std::vector(num_runs); for (auto &timing: timings) { auto start_time = std::chrono::steady_clock::now(); // Executes the main computation - auto status = StatusCode::kSuccess; try { status = run_blas(args, buffers, queue); } catch (...) { status = static_cast(kUnknownError); } diff --git a/test/performance/client.hpp b/test/performance/client.hpp index 5ff2aec7..6d35fced 100644 --- a/test/performance/client.hpp +++ b/test/performance/client.hpp @@ -82,6 +82,9 @@ class Client { const std::vector options_; const GetMetric get_flops_; const GetMetric get_bytes_; + + // Extra arguments + bool warm_up_; // if enabled, do a warm-up run first before measuring execution time }; // ================================================================================================= -- cgit v1.2.3 From 27854070b4f9ba1d58ccd7189032e56325506597 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 6 Jul 2016 21:50:12 +0200 Subject: Added a VERBOSE mode to debug performance: now prints details about compilation and kernel execution to screen --- CMakeLists.txt | 7 +++++++ README.md | 2 ++ src/clpp11.hpp | 10 ++++++++++ src/routine.cpp | 15 +++++++++++++++ src/routines/common.cpp | 16 ++++++++++++++++ 5 files changed, 50 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index adb87658..77d1cd08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,13 @@ option(TUNERS "Enable compilation of the tuners" OFF) option(CLIENTS "Enable compilation of the clients to test and compare performance" OFF) option(TESTS "Enable compilation of the correctness tests" OFF) +# Compile in verbose mode with additional diagnostic messages +option(VERBOSE "Compile in verbose mode for additional diagnostic messages" OFF) +if(VERBOSE) + message("-- Building in verbose mode") + add_definitions(-DVERBOSE) +endif() + # ================================================================================================== # RPATH settings diff --git a/README.md b/README.md index 7fca2a35..5f3e6529 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,8 @@ The folder `doc/performance` contains some PDF files with performance results on Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See above under `Using the tuners` to find out how to tune for your device. 
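As a usage sketch for the -warm_up option added to the performance clients above: the flag names (kArgWarmUp = "warm_up", kArgNumRuns = "runs") come from the patch itself, but the client binary name below is only an illustration and depends on building with -DCLIENTS=ON:

    ./clblast_client_xgemm -warm_up -runs 10

With the flag set, TimedExecution() first performs one untimed call of the routine, so that kernel compilation and other one-time initialisations are kept out of the reported timings.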
+In case performance is still sub-optimal or something else is wrong, CLBlast can be build in verbose mode for (performance) debugging by specifying `-DVERBOSE=ON` to CMake. + Supported routines ------------- diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 2b21e1e1..fcb71e38 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -664,6 +664,16 @@ class Kernel { return result; } + // Retrieves the name of the kernel + std::string GetFunctionName() { + auto bytes = size_t{0}; + CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes)); + auto result = std::string{}; + result.resize(bytes); + CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr)); + return std::string{result.c_str()}; // Removes any trailing '\0'-characters + } + // Launches a kernel onto the specified queue void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event) { diff --git a/src/routine.cpp b/src/routine.cpp index d3590896..3c3343da 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -13,6 +13,7 @@ #include #include +#include #include "routine.hpp" @@ -103,6 +104,13 @@ StatusCode Routine::SetUp() { // Combines everything together into a single source string const auto source_string = defines + common_header + source_string_; + // Prints details of the routine to compile in case of debugging in verbose mode + #ifdef VERBOSE + printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n", + routine_name_.c_str(), ToString(precision_).c_str(), device_name_.c_str()); + const auto start_time = std::chrono::steady_clock::now(); + #endif + // Compiles the kernel try { auto program = Program(context_, source_string); @@ -123,6 +131,13 @@ StatusCode Routine::SetUp() { StoreProgramToCache(program, context_, precision_, routine_name_); } catch (...) { return StatusCode::kBuildProgramFailure; } + // Prints the elapsed compilation time in case of debugging in verbose mode + #ifdef VERBOSE + const auto elapsed_time = std::chrono::steady_clock::now() - start_time; + const auto timing = std::chrono::duration(elapsed_time).count(); + printf("[DEBUG] Completed compilation in %.2lf ms\n", timing); + #endif + // No errors, normal termination of this function return StatusCode::kSuccess; } diff --git a/src/routines/common.cpp b/src/routines/common.cpp index c378df28..2e82e04d 100644 --- a/src/routines/common.cpp +++ b/src/routines/common.cpp @@ -12,6 +12,7 @@ // ================================================================================================= #include +#include #include "routines/common.hpp" @@ -44,11 +45,26 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, const auto local_mem_usage = kernel.LocalMemUsage(device); if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; } + // Prints the name of the kernel to launch in case of debugging in verbose mode + #ifdef VERBOSE + queue.Finish(); + printf("[DEBUG] Running kernel '%s'\n", kernel.GetFunctionName().c_str()); + const auto start_time = std::chrono::steady_clock::now(); + #endif + // Launches the kernel (and checks for launch errors) try { kernel.Launch(queue, global, local, event, waitForEvents); } catch (...) 
{ return StatusCode::kKernelLaunchError; } + // Prints the elapsed execution time in case of debugging in verbose mode + #ifdef VERBOSE + queue.Finish(); + const auto elapsed_time = std::chrono::steady_clock::now() - start_time; + const auto timing = std::chrono::duration(elapsed_time).count(); + printf("[DEBUG] Completed kernel in %.2lf ms\n", timing); + #endif + // No errors, normal termination of this function return StatusCode::kSuccess; } -- cgit v1.2.3 From 9caa7ca5b9c1fdf99473582cd357506dffd51b44 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Fri, 8 Jul 2016 20:57:58 +0200 Subject: Cache now compares cl_context instead of a pointer to a context; added verbose print statements to the cache --- CHANGELOG | 1 + src/cache.cpp | 12 ++++++++++++ src/cache.hpp | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 15948bbd..248db397 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,7 @@ Development version (next release) - Fixed proper MSVC dllimport and dllexport declarations - Fixed memory leaks related to events not being released - Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems +- Fixed a bug related to the cache and retrieval of programs based on the OpenCL context - Added an option (-warm_up) to do a warm-up run before timing in the performance clients - Added tuned parameters for various devices (see README) diff --git a/src/cache.cpp b/src/cache.cpp index cd9055d0..2b90eccc 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -23,6 +23,9 @@ namespace clblast { // Stores the compiled binary or IR in the cache void StoreBinaryToCache(const std::string &binary, const std::string &device_name, const Precision &precision, const std::string &routine_name) { + #ifdef VERBOSE + printf("[DEBUG] Storing binary in cache\n"); + #endif binary_cache_mutex_.lock(); binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name}); binary_cache_mutex_.unlock(); @@ -31,6 +34,9 @@ void StoreBinaryToCache(const std::string &binary, const std::string &device_nam // Stores the compiled program in the cache void StoreProgramToCache(const Program &program, const Context &context, const Precision &precision, const std::string &routine_name) { + #ifdef VERBOSE + printf("[DEBUG] Storing program in cache\n"); + #endif program_cache_mutex_.lock(); program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name}); program_cache_mutex_.unlock(); @@ -40,6 +46,9 @@ void StoreProgramToCache(const Program &program, const Context &context, // otherwise. const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, const std::string &routine_name) { + #ifdef VERBOSE + printf("[DEBUG] Retrieving binary from cache\n"); + #endif binary_cache_mutex_.lock(); for (auto &cached_binary: binary_cache_) { if (cached_binary.MatchInCache(device_name, precision, routine_name)) { @@ -55,6 +64,9 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec // otherwise. 
const Program& GetProgramFromCache(const Context &context, const Precision &precision, const std::string &routine_name) { + #ifdef VERBOSE + printf("[DEBUG] Retrieving program from cache\n"); + #endif program_cache_mutex_.lock(); for (auto &cached_program: program_cache_) { if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { diff --git a/src/cache.hpp b/src/cache.hpp index 0d74d7bc..8f540891 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -55,7 +55,7 @@ struct ProgramCache { // Finds out whether the properties match bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision, const std::string &ref_routine) { - return (context_ptr == ref_context && + return (*context_ptr == *ref_context && precision == ref_precision && routine_name_ == ref_routine); } -- cgit v1.2.3 From 39e9b1238ff66a680579a181c0f0c2424e65e003 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 10 Jul 2016 11:24:36 +0200 Subject: Fixed a bug related to the cache and retrieval of programs based on the OpenCL context --- src/cache.cpp | 6 +++--- src/cache.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cache.cpp b/src/cache.cpp index 2b90eccc..6080f082 100644 --- a/src/cache.cpp +++ b/src/cache.cpp @@ -38,7 +38,7 @@ void StoreProgramToCache(const Program &program, const Context &context, printf("[DEBUG] Storing program in cache\n"); #endif program_cache_mutex_.lock(); - program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name}); + program_cache_.push_back(ProgramCache{program, context(), precision, routine_name}); program_cache_mutex_.unlock(); } @@ -69,7 +69,7 @@ const Program& GetProgramFromCache(const Context &context, const Precision &prec #endif program_cache_mutex_.lock(); for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { + if (cached_program.MatchInCache(context(), precision, routine_name)) { program_cache_mutex_.unlock(); return cached_program.program; } @@ -97,7 +97,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision, const std::string &routine_name) { program_cache_mutex_.lock(); for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) { + if (cached_program.MatchInCache(context(), precision, routine_name)) { program_cache_mutex_.unlock(); return true; } diff --git a/src/cache.hpp b/src/cache.hpp index 8f540891..9075da0d 100644 --- a/src/cache.hpp +++ b/src/cache.hpp @@ -48,14 +48,14 @@ static std::mutex binary_cache_mutex_; // The cache of compiled OpenCL programs, along with some meta-data struct ProgramCache { Program program; - ContextPointer context_ptr; + cl_context context; Precision precision; std::string routine_name_; // Finds out whether the properties match - bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision, + bool MatchInCache(const cl_context ref_context, const Precision &ref_precision, const std::string &ref_routine) { - return (*context_ptr == *ref_context && + return (context == ref_context && precision == ref_precision && routine_name_ == ref_routine); } -- cgit v1.2.3 From 57f09178d89a1cf4f38a0bb338c864ed850d5470 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 10 Jul 2016 11:46:44 +0200 Subject: Added tuning results for AMD Oland and for Intel Graphics HD 530 --- README.md | 6 ++++-- src/database/kernels/copy.hpp | 6 ++++++ src/database/kernels/pad.hpp | 8 +++++++- 
src/database/kernels/padtranspose.hpp | 6 ++++++ src/database/kernels/transpose.hpp | 6 ++++++ src/database/kernels/xaxpy.hpp | 6 ++++++ src/database/kernels/xdot.hpp | 6 ++++++ src/database/kernels/xgemm.hpp | 12 +++++++++--- src/database/kernels/xgemv.hpp | 8 +++++++- src/database/kernels/xger.hpp | 16 +++++++++++----- 10 files changed, 68 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 5f3e6529..1b3adcb9 100644 --- a/README.md +++ b/README.md @@ -110,11 +110,13 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC - Tesla K20m - Tesla K40m * AMD GPUs: - - Tahiti + - AMD Radeon R9 M370X Compute Engine - Hawaii + - Oland - Pitcairn - - Radeon R9 M370X Compute Engine + - Tahiti * Intel GPUs: + - HD Graphics 530 - HD Graphics Haswell Ultrabook GT2 Mobile - HD Graphics Skylake ULT GT2 - Iris diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index a76a08e7..d592f110 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -38,6 +38,7 @@ const Database::DatabaseEntry Database::CopySingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, + { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, @@ -59,6 +60,7 @@ const Database::DatabaseEntry Database::CopySingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, @@ -105,6 +107,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, @@ -120,6 +123,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, @@ -164,6 +168,7 @@ const Database::DatabaseEntry Database::CopyDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { 
{"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, + { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, @@ -222,6 +227,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Hawaii", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, + { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index 6bb0bc86..cd034f15 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -38,9 +38,10 @@ const Database::DatabaseEntry Database::PadSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, + { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -59,6 +60,7 @@ const Database::DatabaseEntry Database::PadSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, @@ -105,6 +107,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -126,6 +129,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, 
{"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, @@ -172,6 +176,7 @@ const Database::DatabaseEntry Database::PadDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, @@ -230,6 +235,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Hawaii", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index b117f9fe..c2034c3e 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -38,6 +38,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, + { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, @@ -59,6 +60,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -105,6 +107,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, @@ -126,6 +129,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { 
{"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -172,6 +176,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, @@ -230,6 +235,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Hawaii", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, + { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index 8faa53f4..8e852c4b 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -38,6 +38,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, + { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -59,6 +60,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, @@ -105,6 +107,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -126,6 +129,7 @@ const 
Database::DatabaseEntry Database::TransposeComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, @@ -166,6 +170,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Hawaii", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, @@ -224,6 +229,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Hawaii", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index b6ea1d7e..905ee084 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -38,6 +38,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",2} } }, + { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -59,6 +60,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",8}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } }, { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -105,6 +107,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",2}, {"WGS",64}, {"WPT",8} } }, { "Hawaii", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, + { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -126,6 +129,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, 
{"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, @@ -172,6 +176,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Hawaii", { {"VW",1}, {"WGS",64}, {"WPT",2} } }, + { "Oland", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -230,6 +235,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Hawaii", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 8670c50b..e36dd8ca 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -38,6 +38,7 @@ const Database::DatabaseEntry Database::XdotSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, { "default", { {"WGS1",128}, {"WGS2",32} } }, @@ -51,6 +52,7 @@ const Database::DatabaseEntry Database::XdotSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, @@ -87,6 +89,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Oland", { {"WGS1",128}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, @@ -100,6 +103,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } }, @@ -136,6 +140,7 @@ const Database::DatabaseEntry Database::XdotDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, @@ -177,6 +182,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, + { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", 
{ {"WGS1",256}, {"WGS2",32} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index f2cd46d1..736f2695 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -32,6 +32,7 @@ const Database::DatabaseEntry Database::XgemmSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, + { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -53,11 +54,12 @@ const Database::DatabaseEntry Database::XgemmSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, { "Iris Pro", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Intel accelerators @@ -99,6 +101,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, { "Hawaii", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, 
{"VWN",1} } }, + { "Oland", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Pitcairn", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, @@ -120,11 +123,12 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Iris Pro", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // Intel accelerators @@ -166,9 +170,10 @@ const Database::DatabaseEntry Database::XgemmDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, + { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, { "Tahiti", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } }, - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + 
{ "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, { // ARM GPUs @@ -224,6 +229,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } }, { "Hawaii", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, + { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "Tahiti", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 0f66e497..65f4b5c8 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -38,6 +38,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Oland", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, @@ -52,11 +53,12 @@ const Database::DatabaseEntry Database::XgemvSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, { "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } }, { "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, } }, { // Intel accelerators @@ -98,6 +100,7 @@ 
const Database::DatabaseEntry Database::XgemvComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Oland", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, @@ -112,6 +115,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, { "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, @@ -154,6 +158,7 @@ const Database::DatabaseEntry Database::XgemvDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Oland", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, @@ -205,6 +210,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Oland", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index f02482a0..216925fc 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -38,9 +38,10 @@ const Database::DatabaseEntry Database::XgerSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, { "Hawaii", { {"WGS1",64}, 
{"WGS2",2}, {"WPT",1} } }, + { "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs @@ -58,6 +59,7 @@ const Database::DatabaseEntry Database::XgerSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, @@ -92,9 +94,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "Hawaii", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, + { "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",4}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs @@ -112,10 +115,11 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, } }, { // NVIDIA GPUs @@ -132,7 +136,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",4}, {"WGS2",1}, {"WPT",1} } }, } }, } @@ -146,6 +150,7 @@ const Database::DatabaseEntry Database::XgerDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, { "Hawaii", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, + { "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, @@ -192,9 +197,10 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Hawaii", { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } }, + { "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs -- cgit v1.2.3 From c87e877bf23d2fe38a7da2898e1734a3cdeaf48c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 10 Jul 2016 20:32:01 +0200 Subject: Now passing alpha/beta to the kernel as arguments as before fp16 support; in case of fp16 arguments are cast on host and in kernel --- CHANGELOG | 1 + src/kernels/common.opencl | 10 ++++++++++ src/kernels/level1/xaxpy.opencl | 8 ++++---- src/kernels/level2/xgemv.opencl | 8 
++++---- src/kernels/level2/xgemv_fast.opencl | 16 ++++++++-------- src/kernels/level2/xger.opencl | 4 ++-- src/kernels/level2/xher.opencl | 4 ++-- src/kernels/level2/xher2.opencl | 4 ++-- src/kernels/level3/copy_fast.opencl | 4 ++-- src/kernels/level3/copy_pad.opencl | 8 ++++---- src/kernels/level3/transpose_fast.opencl | 4 ++-- src/kernels/level3/transpose_pad.opencl | 8 ++++---- src/kernels/level3/xgemm_part2.opencl | 24 ++++++++++++------------ src/routines/common.hpp | 8 ++------ src/routines/level1/xaxpy.cpp | 8 ++------ src/routines/level2/xgemv.cpp | 10 ++-------- src/routines/level2/xger.cpp | 6 +----- src/routines/level2/xher.cpp | 6 +----- src/routines/level2/xher2.cpp | 6 +----- src/routines/level3/xgemm.cpp | 10 ++-------- src/routines/level3/xher2k.cpp | 16 +++++----------- src/routines/level3/xherk.cpp | 10 +++------- src/routines/level3/xsyr2k.cpp | 13 +++---------- src/routines/level3/xsyrk.cpp | 10 ++-------- src/tuning/kernels/copy_fast.cpp | 3 +-- src/tuning/kernels/copy_pad.cpp | 3 +-- src/tuning/kernels/transpose_fast.cpp | 3 +-- src/tuning/kernels/transpose_pad.cpp | 3 +-- src/tuning/kernels/xaxpy.cpp | 3 +-- src/tuning/kernels/xgemm.cpp | 6 ++---- src/tuning/kernels/xgemv.cpp | 6 ++---- src/tuning/kernels/xger.cpp | 3 +-- src/utilities.cpp | 8 ++++++++ src/utilities.hpp | 6 ++++++ 34 files changed, 105 insertions(+), 145 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 248db397..b6e09102 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -5,6 +5,7 @@ Development version (next release) - Fixed memory leaks related to events not being released - Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context +- Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels - Added an option (-warm_up) to do a warm-up run before timing in the performance clients - Added tuned parameters for various devices (see README) diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 08c47d87..9d2bb65e 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -109,6 +109,16 @@ R"( typedef real singlereal; #endif +// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no +// conversion, but half-precision is not supported as kernel argument so it is converted from float. +#if PRECISION == 16 + typedef float real_arg; + #define GetRealArg(x) (half)x +#else + typedef real real_arg; + #define GetRealArg(x) x +#endif + // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. 
For specific diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index e0efadc1..d533041b 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -23,10 +23,10 @@ R"( // Full version of the kernel with offsets and strided accesses __attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, +__kernel void Xaxpy(const int n, const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); // Loops over the work that needs to be done (allows for an arbitrary number of threads) #pragma unroll @@ -41,10 +41,10 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. __attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha, +__kernel void XaxpyFast(const int n, const real_arg arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int w=0; w 'do_conjugate' is 0 __attribute__((reqd_work_group_size(WGS2, 1, 1))) __kernel void XgemvFast(const int m, const int n, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, + const real_arg arg_alpha, + const real_arg arg_beta, const int a_rotated, const __global realVF* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl, const int ku) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Local memory for the vector X __local real xlm[WGS2]; @@ -198,16 +198,16 @@ __kernel void XgemvFast(const int m, const int n, // --> 'do_conjugate' is 0 __attribute__((reqd_work_group_size(WGS3, 1, 1))) __kernel void XgemvFastRot(const int m, const int n, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, + const real_arg arg_alpha, + const real_arg arg_beta, const int a_rotated, const __global realVFR* restrict agm, const int a_offset, const int a_ld, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, const int kl, const int ku) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Local memory for the vector X __local real xlm[WGS3]; diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl index 63817afb..f218a346 100644 --- a/src/kernels/level2/xger.opencl +++ b/src/kernels/level2/xger.opencl @@ -20,12 +20,12 @@ R"( // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) __kernel void Xger(const int max1, const int max2, - const __constant real* restrict arg_alpha, + const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global 
real* ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_rowmajor) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y real xvalues[WPT]; diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index fc635f2e..1200ee63 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -20,11 +20,11 @@ R"( // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) __kernel void Xher(const int n, - const __constant real* restrict arg_alpha, + const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); // Register storage for X and XT real xvalues[WPT]; diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl index a66f255f..d0f41571 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -20,12 +20,12 @@ R"( // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) __kernel void Xher2(const int n, - const __constant real* restrict arg_alpha, + const real_arg arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { - const real alpha = arg_alpha[0]; + const real alpha = GetRealArg(arg_alpha); // Register storage for X and Y real xvalues[WPT]; diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index 09e54e6d..dd975bf1 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -39,8 +39,8 @@ __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) __kernel void CopyMatrixFast(const int ld, __global const realC* restrict src, __global realC* dest, - const __constant real* restrict arg_alpha) { - const real alpha = arg_alpha[0]; + const real_arg arg_alpha) { + const real alpha = GetRealArg(arg_alpha); #pragma unroll for (int w_one=0; w_one GetGroupID0()*MWG) { @@ -354,13 +354,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK, // Main entry point of the kernel. This is the regular full version. 
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) __kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const __constant real* restrict arg_alpha, - const __constant real* restrict arg_beta, + const real_arg arg_alpha, + const real_arg arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm) { - const real alpha = arg_alpha[0]; - const real beta = arg_beta[0]; + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); // Allocates workgroup-private memory (local memory) #if SA == 1 diff --git a/src/routines/common.hpp b/src/routines/common.hpp index c99cd39d..e624a2b1 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -88,10 +88,6 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont } } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context, 1); - alpha_buffer.Write(queue, 1, &alpha); - // Retrieves the kernel from the compiled binary try { auto kernel = Kernel(program, kernel_name); @@ -101,7 +97,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont kernel.SetArgument(0, static_cast(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); - kernel.SetArgument(3, alpha_buffer()); + kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast(src_one)); @@ -114,7 +110,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont kernel.SetArgument(7, static_cast(dest_ld)); kernel.SetArgument(8, static_cast(dest_offset)); kernel.SetArgument(9, dest()); - kernel.SetArgument(10, alpha_buffer()); + kernel.SetArgument(10, GetRealArg(alpha)); if (do_pad) { kernel.SetArgument(11, static_cast(do_conjugate)); } diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 5b6c9e77..3445e2b5 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -59,20 +59,16 @@ StatusCode Xaxpy::DoAxpy(const size_t n, const T alpha, const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); auto kernel = Kernel(program, kernel_name); - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, y_buffer()); } else { kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 21fb397c..2842ef07 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -126,12 +126,6 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, local_size = db_["WGS3"]; } - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Retrieves the Xgemv kernel from the compiled binary try { const auto 
program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -140,8 +134,8 @@ StatusCode Xgemv::MatVec(const Layout layout, const Transpose a_transpose, // Sets the kernel arguments kernel.SetArgument(0, static_cast(m_real)); kernel.SetArgument(1, static_cast(n_real)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, static_cast(a_rotated)); kernel.SetArgument(5, a_buffer()); kernel.SetArgument(6, static_cast(a_offset)); diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index 353047d2..29cffe0c 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -56,10 +56,6 @@ StatusCode Xger::DoGer(const Layout layout, status = TestVectorY(n, y_buffer, y_offset, y_inc); if (ErrorIn(status)) { return status; } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -68,7 +64,7 @@ StatusCode Xger::DoGer(const Layout layout, // Sets the kernel arguments kernel.SetArgument(0, static_cast(a_one)); kernel.SetArgument(1, static_cast(a_two)); - kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); kernel.SetArgument(3, x_buffer()); kernel.SetArgument(4, static_cast(x_offset)); kernel.SetArgument(5, static_cast(x_inc)); diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index ed8ba9e9..6dd95938 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -70,10 +70,6 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // Creates a matching version of alpha const auto matching_alpha = GetAlpha(alpha); - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &matching_alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -81,7 +77,7 @@ StatusCode Xher::DoHer(const Layout layout, const Triangle triangle, // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(matching_alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 50572cea..3d57a9b9 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -58,10 +58,6 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, status = TestVectorY(n, y_buffer, y_offset, y_inc); if (ErrorIn(status)) { return status; } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); @@ -69,7 +65,7 @@ StatusCode Xher2::DoHer2(const Layout layout, const Triangle triangle, // Sets the kernel arguments kernel.SetArgument(0, static_cast(n)); - kernel.SetArgument(1, alpha_buffer()); + 
kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast(x_offset)); kernel.SetArgument(4, static_cast(x_inc)); diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 9ea5559c..97e8db7e 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -118,12 +118,6 @@ StatusCode Xgemm::DoGemm(const Layout layout, const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); const auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, m_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -174,8 +168,8 @@ StatusCode Xgemm::DoGemm(const Layout layout, kernel.SetArgument(0, static_cast(m_ceiled)); kernel.SetArgument(1, static_cast(n_ceiled)); kernel.SetArgument(2, static_cast(k_ceiled)); - kernel.SetArgument(3, alpha_buffer()); - kernel.SetArgument(4, beta_buffer()); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, b_temp()); kernel.SetArgument(7, c_temp()); diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index bd7a053e..65e2be55 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -107,12 +107,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + // Convert the arguments to complex versions auto complex_beta = T{beta, static_cast(0.0)}; - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &complex_beta); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); @@ -180,8 +176,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a1_temp()); kernel.SetArgument(5, b2_temp()); kernel.SetArgument(6, c_temp()); @@ -202,10 +198,8 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; auto complex_one = T{static_cast(1.0), static_cast(0.0)}; - alpha_buffer.Write(queue_, 1, &conjugate_alpha); - beta_buffer.Write(queue_, 1, &complex_one); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(conjugate_alpha)); + kernel.SetArgument(3, GetRealArg(complex_one)); kernel.SetArgument(4, b1_temp()); kernel.SetArgument(5, a2_temp()); diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 
6ef7f21f..cc87e3e9 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -98,13 +98,9 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + // Convert the arguments to complex versions auto complex_alpha = T{alpha, static_cast(0.0)}; auto complex_beta = T{beta, static_cast(0.0)}; - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &complex_alpha); - beta_buffer.Write(queue_, 1, &complex_beta); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); @@ -152,8 +148,8 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(complex_alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 424d4d2d..18a1eac7 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -97,12 +97,6 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -149,8 +143,8 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); @@ -170,8 +164,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Swaps the arguments for matrices A and B, and sets 'beta' to 1 auto one = static_cast(1); - beta_buffer.Write(queue_, 1, &one); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(3, GetRealArg(one)); kernel.SetArgument(4, b_temp()); kernel.SetArgument(5, a_temp()); diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index f56c232b..1992cec1 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -90,12 +90,6 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const auto a_temp = (a_no_temp) ? 
a_buffer : Buffer(context_, k_ceiled*n_ceiled); auto c_temp = Buffer(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer(context_, 1); - auto beta_buffer = Buffer(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); auto emptyEventList = std::vector(); @@ -132,8 +126,8 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Sets the kernel arguments kernel.SetArgument(0, static_cast(n_ceiled)); kernel.SetArgument(1, static_cast(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, c_temp()); diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp index 34269bc7..78ded56e 100644 --- a/src/tuning/kernels/copy_fast.cpp +++ b/src/tuning/kernels/copy_fast.cpp @@ -86,11 +86,10 @@ class TuneCopy { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); } // Describes how to compute the performance metrics diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp index 1e0dccd3..90f5ea82 100644 --- a/src/tuning/kernels/copy_pad.cpp +++ b/src/tuning/kernels/copy_pad.cpp @@ -86,7 +86,6 @@ class TunePad { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.m)); @@ -97,7 +96,7 @@ class TunePad { tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(0); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp index 7ac19cb6..10fa80cb 100644 --- a/src/tuning/kernels/transpose_fast.cpp +++ b/src/tuning/kernels/transpose_fast.cpp @@ -91,11 +91,10 @@ class TuneTranspose { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); } // Describes how to compute the performance metrics diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp index 63274415..507718eb 100644 --- a/src/tuning/kernels/transpose_pad.cpp +++ b/src/tuning/kernels/transpose_pad.cpp @@ -90,7 +90,6 @@ class TunePadTranspose { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); 
tuner.AddArgumentScalar(static_cast(args.m)); @@ -101,7 +100,7 @@ class TunePadTranspose { tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(0); tuner.AddArgumentOutput(b_mat); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentScalar(0); } diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp index 88d12c1f..0033b3c6 100644 --- a/src/tuning/kernels/xaxpy.cpp +++ b/src/tuning/kernels/xaxpy.cpp @@ -89,9 +89,8 @@ class TuneXaxpy { std::vector &x_vec, std::vector &y_vec, std::vector &, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentOutput(y_vec); } diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 4b1efdef..898b8435 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -121,13 +121,11 @@ class TuneXgemm { std::vector &, std::vector &, std::vector &a_mat, std::vector &b_mat, std::vector &c_mat, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); tuner.AddArgumentScalar(static_cast(args.k)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); + tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentInput(b_mat); tuner.AddArgumentOutput(c_mat); diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index d42155ae..5c187d33 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -96,13 +96,11 @@ class TuneXgemv { std::vector &x_vec, std::vector &y_vec, std::vector &a_mat, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; - auto beta_buffer = std::vector{args.beta}; auto a_rotated = (V==3) ? 1 : 0; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); - tuner.AddArgumentInput(beta_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); + tuner.AddArgumentScalar(GetRealArg(args.beta)); tuner.AddArgumentScalar(static_cast(a_rotated)); tuner.AddArgumentInput(a_mat); tuner.AddArgumentScalar(0); diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp index d2590c53..1fb5c531 100644 --- a/src/tuning/kernels/xger.cpp +++ b/src/tuning/kernels/xger.cpp @@ -85,10 +85,9 @@ class TuneXger { std::vector &x_vec, std::vector &y_vec, std::vector &a_mat, std::vector &, std::vector &, std::vector &) { - auto alpha_buffer = std::vector{args.alpha}; tuner.AddArgumentScalar(static_cast(args.m)); tuner.AddArgumentScalar(static_cast(args.n)); - tuner.AddArgumentInput(alpha_buffer); + tuner.AddArgumentScalar(GetRealArg(args.alpha)); tuner.AddArgumentInput(x_vec); tuner.AddArgumentScalar(0); // x_offset tuner.AddArgumentScalar(1); // x_increment diff --git a/src/utilities.cpp b/src/utilities.cpp index 68e480c5..11a6c439 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -332,6 +332,14 @@ void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_com result.Write(queue, size, result_cpu); } +// Converts a 'real' value to a 'real argument' value to be passed to a kernel. 
Normally there is +// no conversion, but half-precision is not supported as kernel argument so it is converted to float. +template <> typename RealArg::Type GetRealArg(const half value) { return HalfToFloat(value); } +template <> typename RealArg::Type GetRealArg(const float value) { return value; } +template <> typename RealArg::Type GetRealArg(const double value) { return value; } +template <> typename RealArg::Type GetRealArg(const float2 value) { return value; } +template <> typename RealArg::Type GetRealArg(const double2 value) { return value; } + // ================================================================================================= // Rounding functions performing ceiling and division operations diff --git a/src/utilities.hpp b/src/utilities.hpp index d5efab9f..700d30d6 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -227,6 +227,12 @@ void FloatToHalfBuffer(std::vector& result, const std::vector& sour Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw); void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw); +// Converts a 'real' value to a 'real argument' value to be passed to a kernel. Normally there is +// no conversion, but half-precision is not supported as kernel argument so it is converted to float. +template struct RealArg { using Type = T; }; +template <> struct RealArg { using Type = float; }; +template typename RealArg::Type GetRealArg(const T value); + // ================================================================================================= // Rounding functions -- cgit v1.2.3 From e0ba59c0ac964340b3706233862a2ca98e345823 Mon Sep 17 00:00:00 2001 From: Gian-Carlo Pascutto Date: Mon, 11 Jul 2016 09:24:40 +0200 Subject: Make sure the passed types are large enough. Make sure all out parameters that are passed to functions such as clGetDeviceInfo are large enough to contain the replies. 
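The commit message describes the underlying contract of clGetDeviceInfo: the reply type is fixed by the query, so the out parameter must be at least that large. Reading CL_DEVICE_GLOBAL_MEM_SIZE (a cl_ulong) into a cl_uint either truncates the value or clobbers neighbouring memory. A small stand-alone illustration of the safe pattern, assuming the OpenCL headers and at least one platform and device are available (error handling omitted):

#include <cstdio>
#include <CL/cl.h>

int main() {
  // Grab the first platform and any device on it (error handling omitted).
  cl_platform_id platform;
  clGetPlatformIDs(1, &platform, nullptr);
  cl_device_id device;
  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, nullptr);

  // CL_DEVICE_GLOBAL_MEM_SIZE is specified to return a cl_ulong, so the out
  // parameter must be a cl_ulong -- a cl_uint is too small to hold the reply.
  cl_ulong mem_size = 0;
  clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE,
                  sizeof(mem_size), &mem_size, nullptr);
  std::printf("global memory: %llu bytes\n",
              static_cast<unsigned long long>(mem_size));

  // CL_DEVICE_MAX_COMPUTE_UNITS returns a cl_uint: read into exactly that
  // type and widen afterwards if a larger integer is wanted.
  cl_uint units = 0;
  clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS,
                  sizeof(units), &units, nullptr);
  std::printf("compute units: %u\n", static_cast<unsigned>(units));
  return 0;
}

The wrapper keeps wider types in its public accessors and widens with static_cast only after reading into the correctly sized type, which is also what the follow-up type-fix commit further down does.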
--- src/clpp11.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index fcb71e38..9f924a4f 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -194,7 +194,7 @@ class Device { } size_t MaxWorkGroupSize() const { return GetInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE); } size_t MaxWorkItemDimensions() const { - return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); + return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); } std::vector MaxWorkItemSizes() const { return GetInfoVector(CL_DEVICE_MAX_WORK_ITEM_SIZES); @@ -205,8 +205,8 @@ class Device { std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); } size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); } - size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); } - size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); } + size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); } + size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); } size_t MemoryClock() const { return 0; } // Not exposed in OpenCL size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL @@ -250,7 +250,7 @@ class Device { size_t GetInfo(const cl_device_info info) const { auto bytes = size_t{0}; CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = cl_uint(0); + auto result = cl_ulong(0); CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); return static_cast(result); } -- cgit v1.2.3 From 066af4069ba5c92decc7652e5c6d36c27849ccab Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 16 Jul 2016 10:56:37 +0200 Subject: Removed an unused variable from the copy-transpose-pad function --- src/routines/common.hpp | 2 +- src/routines/level3/xgemm.cpp | 8 ++++---- src/routines/level3/xher2k.cpp | 12 ++++++------ src/routines/level3/xherk.cpp | 8 ++++---- src/routines/level3/xsyr2k.cpp | 8 ++++---- src/routines/level3/xsyrk.cpp | 6 +++--- src/routines/levelx/xomatcopy.cpp | 2 +- 7 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/routines/common.hpp b/src/routines/common.hpp index e624a2b1..d53bdc25 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -41,7 +41,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able // to write to symmetric and triangular matrices through optional arguments. template -StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context, +StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Database &db, EventPointer event, std::vector& waitForEvents, const size_t src_one, const size_t src_two, diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 97e8db7e..0db28537 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -127,7 +127,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // case nothing has to be done, these kernels can be skipped. 
if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, m_ceiled, k_ceiled, m_ceiled, 0, a_temp, ConstantOne(), program, @@ -139,7 +139,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // As above, but now for matrix B if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne(), program, @@ -151,7 +151,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // As above, but now for matrix C. This is only necessary if C is used both as input and output. if (!c_no_temp && beta != static_cast(0)) { auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, ConstantOne(), program, @@ -190,7 +190,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Runs the post-processing kernel if needed if (!c_no_temp) { eventWaitList.push_back(eventKernel); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, m_ceiled, n_ceiled, m_ceiled, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index 65e2be55..1ba6080f 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -119,7 +119,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // case nothing has to be done, these kernels can be skipped. 
if (!a1_no_temp) { auto eventProcessA1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a1_temp, ConstantOne(), program, @@ -129,7 +129,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!a2_no_temp) { auto eventProcessA2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a2_temp, ConstantOne(), program, @@ -139,7 +139,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!b1_no_temp) { auto eventProcessB1 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b1_temp, ConstantOne(), program, @@ -149,7 +149,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co } if (!b2_no_temp) { auto eventProcessB2 = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b2_temp, ConstantOne(), program, @@ -161,7 +161,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -212,7 +212,7 @@ StatusCode Xher2k::DoHer2k(const Layout layout, const Triangle triangle, co // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index cc87e3e9..0fa1b7b1 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -111,7 +111,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // case nothing has to be done, these kernels can be skipped. Two copies are created. 
if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -121,7 +121,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons } if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne(), program, @@ -133,7 +133,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -170,7 +170,7 @@ StatusCode Xherk::DoHerk(const Layout layout, const Triangle triangle, cons // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 18a1eac7..5a90a5a2 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -106,7 +106,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, ab_one, ab_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -116,7 +116,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons } if (!b_no_temp) { auto eventProcessB = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, ab_one, ab_two, b_ld, b_offset, b_buffer, n_ceiled, k_ceiled, n_ceiled, 0, b_temp, ConstantOne(), program, @@ -128,7 +128,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. 
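The comment repeated in these routines describes the structure shared by the rank-k and rank-2k updates: C is copied into a padded temporary sized to the ceiled dimensions, the main kernel updates the whole temporary, and the post-processing step writes back only the requested triangle so the caller's other triangle stays untouched. Below is a condensed host-only sketch of that pad/compute/unpad-triangle flow, using plain vectors and hypothetical helpers in place of PadCopyTransposeMatrix and the OpenCL kernels.

#include <cstddef>
#include <cstdio>
#include <vector>

using std::size_t;
using Matrix = std::vector<double>;

size_t Ceil(size_t x, size_t mult) { return ((x + mult - 1) / mult) * mult; }

// "Pad": copy the n x n user matrix into a zero-initialised
// n_ceiled x n_ceiled temporary (column-major, as in the kernels).
Matrix PadCopy(const Matrix& c, size_t n, size_t n_ceiled) {
  Matrix tmp(n_ceiled * n_ceiled, 0.0);
  for (size_t j = 0; j < n; ++j)
    for (size_t i = 0; i < n; ++i)
      tmp[j * n_ceiled + i] = c[j * n + i];
  return tmp;
}

// "Unpad": write back only the requested triangle, leaving the other
// triangle of the caller's matrix untouched.
void UnpadTriangle(const Matrix& tmp, Matrix& c, size_t n, size_t n_ceiled,
                   bool upper) {
  for (size_t j = 0; j < n; ++j)
    for (size_t i = 0; i < n; ++i)
      if ((upper && i <= j) || (!upper && i >= j))
        c[j * n + i] = tmp[j * n_ceiled + i];
}

int main() {
  const size_t n = 3, n_ceiled = Ceil(n, 4);
  Matrix c(n * n, 7.0);                     // caller's C, both triangles set
  Matrix c_temp = PadCopy(c, n, n_ceiled);  // pre-processing step
  for (auto& v : c_temp) v += 1.0;          // stand-in for the main kernel
  UnpadTriangle(c_temp, c, n, n_ceiled, /*upper=*/true);  // post-processing
  std::printf("upper element: %.1f (updated), lower element: %.1f (kept)\n",
              c[1 * n + 0], c[0 * n + 1]);
}

In the real routines the pre- and post-processing steps are the PadCopyTransposeMatrix calls visible in the surrounding hunks, chained through OpenCL events rather than executed synchronously.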
auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -177,7 +177,7 @@ StatusCode Xsyr2k::DoSyr2k(const Layout layout, const Triangle triangle, cons // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index 1992cec1..46b96b76 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -99,7 +99,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { auto eventProcessA = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, n_ceiled, k_ceiled, n_ceiled, 0, a_temp, ConstantOne(), program, @@ -111,7 +111,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to // modify the other triangle. auto eventProcessC = Event(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, n, n, c_ld, c_offset, c_buffer, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, ConstantOne(), program, @@ -148,7 +148,7 @@ StatusCode Xsyrk::DoSyrk(const Layout layout, const Triangle triangle, const // Runs the post-processing kernel auto upper = (triangle == Triangle::kUpper); auto lower = (triangle == Triangle::kLower); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, n_ceiled, n_ceiled, n_ceiled, 0, c_temp, n, n, c_ld, c_offset, c_buffer, ConstantOne(), program, diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp index e8593301..af9080af 100644 --- a/src/routines/levelx/xomatcopy.cpp +++ b/src/routines/levelx/xomatcopy.cpp @@ -72,7 +72,7 @@ StatusCode Xomatcopy::DoOmatcopy(const Layout layout, const Transpose a_trans const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); auto emptyEventList = std::vector(); - status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList, + status = PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, b_one, b_two, b_ld, b_offset, b_buffer, alpha, program, false, transpose, conjugate); -- cgit v1.2.3 From b33bec4a59d9d4d0b2e6a3d7e5f1d6e23d4279cb Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 16 Jul 2016 11:13:23 +0200 Subject: Fixed some more types and type conversions in the clpp11 interface to OpenCL --- src/clpp11.hpp | 25 +++++++++++++------------ 1 file 
changed, 13 insertions(+), 12 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 9f924a4f..af9d2ea4 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -194,7 +194,7 @@ class Device { } size_t MaxWorkGroupSize() const { return GetInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE); } size_t MaxWorkItemDimensions() const { - return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); + return static_cast(GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS)); } std::vector MaxWorkItemSizes() const { return GetInfoVector(CL_DEVICE_MAX_WORK_ITEM_SIZES); @@ -203,10 +203,18 @@ class Device { return GetInfo(CL_DEVICE_LOCAL_MEM_SIZE); } std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); } - size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); } - size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); } - size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); } - size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); } + size_t CoreClock() const { + return static_cast(GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY)); + } + size_t ComputeUnits() const { + return static_cast(GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS)); + } + unsigned long MemorySize() const { + return static_cast(GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE)); + } + unsigned long MaxAllocSize() const { + return static_cast(GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE)); + } size_t MemoryClock() const { return 0; } // Not exposed in OpenCL size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL @@ -247,13 +255,6 @@ class Device { CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); return result; } - size_t GetInfo(const cl_device_info info) const { - auto bytes = size_t{0}; - CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes)); - auto result = cl_ulong(0); - CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr)); - return static_cast(result); - } template std::vector GetInfoVector(const cl_device_info info) const { auto bytes = size_t{0}; -- cgit v1.2.3 From a1d80e7402d25a4db73d028f4acba660e3b6f19e Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Thu, 7 Jul 2016 05:07:59 +0300 Subject: CMakeLists.txt: use ${clblast_SOURCE_DIR} instead of ${CMAKE_SOURCE_DIR} --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77d1cd08..95d1d500 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") # ================================================================================================== # Package scripts location -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${clblast_SOURCE_DIR}/cmake/Modules/") # Requires OpenCL. It is found through the included "FindOpenCL.cmake" in CMAKE_MODULE_PATH. find_package(OpenCL REQUIRED) -- cgit v1.2.3 From 1ae71614ac227e9b80678505fcb5ecc18a3d3383 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Sun, 10 Jul 2016 23:11:27 +0300 Subject: xgemm: do not hardcode kernel requirements for internal matrix layout Do not hardcode the knowledge about "A and C col-major, B row-major". This allows for easier reuse of the DoGemm() routine with different kernels. 
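The message states the design choice that the diff below implements: the kernel's preferred storage for A, B and C is expressed as explicit a/b/c_want_rotated constants, a transpose is performed only when the caller's layout differs from that preference, and the internal (ceiled) dimensions follow the kernel's preference rather than a hard-coded choice of m, n and k. The decision logic in isolation, as a small sketch with hypothetical inputs:

#include <cstddef>
#include <cstdio>

using std::size_t;

// 'rotated' is how the caller stores the matrix, 'want_rotated' is how the
// kernel expects it (hypothetical driver around the logic in the diff).
struct Plan { bool do_transpose; size_t one_i, two_i; };

Plan MakePlan(bool rotated, bool want_rotated,
              size_t dim1_ceiled, size_t dim2_ceiled) {
  Plan p;
  // A transpose is needed only when caller layout and kernel preference differ.
  p.do_transpose = (rotated != want_rotated);
  // The internal (ceiled) dimensions follow the kernel's preference.
  p.one_i = want_rotated ? dim2_ceiled : dim1_ceiled;
  p.two_i = want_rotated ? dim1_ceiled : dim2_ceiled;
  return p;
}

int main() {
  // Example: the kernel wants A non-rotated (a_want_rotated == false) but the
  // caller supplied a rotated A, e.g. row-major storage without a transpose.
  const auto a = MakePlan(/*rotated=*/true, /*want_rotated=*/false,
                          /*m_ceiled=*/64, /*k_ceiled=*/32);
  std::printf("transpose A: %s, internal A: %u x %u\n",
              a.do_transpose ? "yes" : "no",
              static_cast<unsigned>(a.one_i), static_cast<unsigned>(a.two_i));
}

In the GEMM routine this replaces the previously hard-coded "A and C column-major, B row-major" assumption with constants that a different kernel could set differently.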
--- src/routines/level3/xgemm.cpp | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 0db28537..fce59622 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -63,9 +63,12 @@ StatusCode Xgemm::DoGemm(const Layout layout, const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); const auto c_rotated = (layout == Layout::kRowMajor); - const auto a_do_transpose = a_rotated; - const auto b_do_transpose = !b_rotated; - const auto c_do_transpose = c_rotated; + static const auto a_want_rotated = false; + static const auto b_want_rotated = true; + static const auto c_want_rotated = false; + const auto a_do_transpose = a_rotated != a_want_rotated; + const auto b_do_transpose = b_rotated != b_want_rotated; + const auto c_do_transpose = c_rotated != c_want_rotated; // In case of complex data-types, the transpose can also become a conjugate transpose const auto a_conjugate = (a_transpose == Transpose::kConjugate); @@ -99,6 +102,15 @@ StatusCode Xgemm::DoGemm(const Layout layout, const auto n_ceiled = Ceil(n, db_["NWG"]); const auto k_ceiled = Ceil(k, db_["KWG"]); + // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account + // whether the matrices need to be rotated or not for the kernel. + const auto a_one_i = (a_want_rotated) ? k_ceiled : m_ceiled; + const auto a_two_i = (a_want_rotated) ? m_ceiled : k_ceiled; + const auto b_one_i = (b_want_rotated) ? n_ceiled : k_ceiled; + const auto b_two_i = (b_want_rotated) ? k_ceiled : n_ceiled; + const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled; + const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled; + // The padded/transposed input/output matrices: if memory allocation fails, throw an exception try { @@ -106,17 +118,17 @@ StatusCode Xgemm::DoGemm(const Layout layout, const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); // Determines whether or not temporary matrices are needed - auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 && + auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 && a_do_transpose == false && a_conjugate == false; - auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 && + auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 && b_do_transpose == false && b_conjugate == false; - auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 && + auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 && c_do_transpose == false; // Creates the temporary matrices - const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, k_ceiled*m_ceiled); - const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, k_ceiled*n_ceiled); - const auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, m_ceiled*n_ceiled); + const auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, a_one_i*a_two_i); + const auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, b_one_i*b_two_i); + const auto c_temp = (c_no_temp) ? 
c_buffer : Buffer(context_, c_one_i*c_two_i); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector(); @@ -129,7 +141,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, auto eventProcessA = Event(); status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList, a_one, a_two, a_ld, a_offset, a_buffer, - m_ceiled, k_ceiled, m_ceiled, 0, a_temp, + a_one_i, a_two_i, a_one_i, 0, a_temp, ConstantOne(), program, true, a_do_transpose, a_conjugate); if (ErrorIn(status)) { return status; } @@ -141,7 +153,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, auto eventProcessB = Event(); status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList, b_one, b_two, b_ld, b_offset, b_buffer, - n_ceiled, k_ceiled, n_ceiled, 0, b_temp, + b_one_i, b_two_i, b_one_i, 0, b_temp, ConstantOne(), program, true, b_do_transpose, b_conjugate); if (ErrorIn(status)) { return status; } @@ -153,7 +165,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, auto eventProcessC = Event(); status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList, c_one, c_two, c_ld, c_offset, c_buffer, - m_ceiled, n_ceiled, m_ceiled, 0, c_temp, + c_one_i, c_two_i, c_one_i, 0, c_temp, ConstantOne(), program, true, c_do_transpose, false); if (ErrorIn(status)) { return status; } @@ -176,8 +188,8 @@ StatusCode Xgemm::DoGemm(const Layout layout, // Computes the global and local thread sizes const auto global = std::vector{ - (m_ceiled * db_["MDIMC"]) / db_["MWG"], - (n_ceiled * db_["NDIMC"]) / db_["NWG"] + (c_one_i * db_["MDIMC"]) / db_["MWG"], + (c_two_i * db_["NDIMC"]) / db_["NWG"] }; const auto local = std::vector{db_["MDIMC"], db_["NDIMC"]}; @@ -191,7 +203,7 @@ StatusCode Xgemm::DoGemm(const Layout layout, if (!c_no_temp) { eventWaitList.push_back(eventKernel); status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList, - m_ceiled, n_ceiled, m_ceiled, 0, c_temp, + c_one_i, c_two_i, c_one_i, 0, c_temp, c_one, c_two, c_ld, c_offset, c_buffer, ConstantOne(), program, false, c_do_transpose, false); -- cgit v1.2.3 From 2dd5ee3f758f817238cf74c068c5cad6eb3d46dd Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Mon, 11 Jul 2016 22:36:39 +0300 Subject: clblast::RunKernel, cl::Kernel: take const vector as waitForEvents --- src/clpp11.hpp | 4 +++- src/routines/common.cpp | 2 +- src/routines/common.hpp | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index af9d2ea4..9fa683ac 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -109,7 +109,9 @@ class Event { // Accessor to the private data-member cl_event& operator()() { return *event_; } + const cl_event& operator()() const { return *event_; } cl_event* pointer() { return &(*event_); } + const cl_event* pointer() const { return &(*event_); } private: std::shared_ptr event_; }; @@ -686,7 +688,7 @@ class Kernel { // As above, but with an event waiting list void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event, - std::vector& waitForEvents) { + const std::vector &waitForEvents) { if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } // Builds a plain version of the events waiting list diff --git a/src/routines/common.cpp b/src/routines/common.cpp index 2e82e04d..21e16954 100644 --- a/src/routines/common.cpp +++ b/src/routines/common.cpp @@ -22,7 +22,7 @@ namespace clblast { // Enqueues a kernel, waits for completion, and 
checks for errors StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local, - EventPointer event, std::vector& waitForEvents) { + EventPointer event, const std::vector &waitForEvents) { // Tests for validity of the local thread sizes if (local.size() > device.MaxWorkItemDimensions()) { diff --git a/src/routines/common.hpp b/src/routines/common.hpp index d53bdc25..d0bbc707 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -29,7 +29,7 @@ namespace clblast { // Enqueues a kernel, waits for completion, and checks for errors StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local, - EventPointer event, std::vector& waitForEvents); + EventPointer event, const std::vector &waitForEvents); // As above, but without an event waiting list StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, @@ -43,7 +43,7 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, template StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Database &db, - EventPointer event, std::vector& waitForEvents, + EventPointer event, const std::vector &waitForEvents, const size_t src_one, const size_t src_two, const size_t src_ld, const size_t src_offset, const Buffer &src, -- cgit v1.2.3 From 5502c5eec4c7b56c3b44ef04046d1621d58be47f Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Mon, 11 Jul 2016 22:55:58 +0300 Subject: cl::Kernel: skip NULL entries in waitForEvents --- src/clpp11.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index 9fa683ac..debfff09 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -689,14 +689,14 @@ class Kernel { void Launch(const Queue &queue, const std::vector &global, const std::vector &local, EventPointer event, const std::vector &waitForEvents) { - if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } - // Builds a plain version of the events waiting list auto waitForEventsPlain = std::vector(); for (auto &waitEvent : waitForEvents) { - waitForEventsPlain.push_back(waitEvent()); + if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); } } + if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } + // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), nullptr, global.data(), local.data(), -- cgit v1.2.3 From ae3299da302ba2c26f1e3490a1a7ee389d91feee Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Mon, 11 Jul 2016 23:14:43 +0300 Subject: clblast::RunKernel, cl::Kernel: unify variants with/without waitForEvents, support empty LWS --- src/clpp11.hpp | 13 ++----------- src/routines/common.cpp | 38 ++++++++++++++++---------------------- src/routines/common.hpp | 7 +------ 3 files changed, 19 insertions(+), 39 deletions(-) diff --git a/src/clpp11.hpp b/src/clpp11.hpp index debfff09..d57223dd 100644 --- a/src/clpp11.hpp +++ b/src/clpp11.hpp @@ -695,23 +695,14 @@ class Kernel { if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); } } - if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); } - // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), - nullptr, global.data(), local.data(), + nullptr, global.data(), !local.empty() ? 
local.data() : nullptr, static_cast(waitForEventsPlain.size()), - waitForEventsPlain.data(), + !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr, event)); } - // As above, but with the default local workgroup size - void Launch(const Queue &queue, const std::vector &global, EventPointer event) { - CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast(global.size()), - nullptr, global.data(), nullptr, - 0, nullptr, event)); - } - // Accessor to the private data-member const cl_kernel& operator()() const { return *kernel_; } private: diff --git a/src/routines/common.cpp b/src/routines/common.cpp index 21e16954..3969cf9f 100644 --- a/src/routines/common.cpp +++ b/src/routines/common.cpp @@ -24,21 +24,23 @@ StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local, EventPointer event, const std::vector &waitForEvents) { - // Tests for validity of the local thread sizes - if (local.size() > device.MaxWorkItemDimensions()) { - return StatusCode::kInvalidLocalNumDimensions; - } - const auto max_work_item_sizes = device.MaxWorkItemSizes(); - for (auto i=size_t{0}; i max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } - } - auto local_size = size_t{1}; - for (auto &item: local) { local_size *= item; } - if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } + if (!local.empty()) { + // Tests for validity of the local thread sizes + if (local.size() > device.MaxWorkItemDimensions()) { + return StatusCode::kInvalidLocalNumDimensions; + } + const auto max_work_item_sizes = device.MaxWorkItemSizes(); + for (auto i=size_t{0}; i max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; } + } + auto local_size = size_t{1}; + for (auto &item: local) { local_size *= item; } + if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; } - // Make sure the global thread sizes are at least equal to the local sizes - for (auto i=size_t{0}; i global, const std::vector &local, - EventPointer event) { - auto emptyWaitingList = std::vector(); - return RunKernel(kernel, queue, device, global, local, event, emptyWaitingList); -} - // ================================================================================================= } // namespace clblast diff --git a/src/routines/common.hpp b/src/routines/common.hpp index d0bbc707..9d8849c3 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -29,12 +29,7 @@ namespace clblast { // Enqueues a kernel, waits for completion, and checks for errors StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, std::vector global, const std::vector &local, - EventPointer event, const std::vector &waitForEvents); - -// As above, but without an event waiting list -StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device, - std::vector global, const std::vector &local, - EventPointer event); + EventPointer event, const std::vector &waitForEvents = {}); // ================================================================================================= -- cgit v1.2.3 From e4e1f05079273f60f4f15280b3f103810c7eb31f Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Tue, 12 Jul 2016 13:33:25 +0300 Subject: clblast::Database, clblast::Routine: implement "database overlays" provided by routine implementation --- src/database/database.cpp | 45 ++++++++++++++++++++++++++------------------- src/database/database.hpp | 10 ++++++++-- src/routine.cpp | 5 +++-- 
src/routine.hpp | 3 ++- 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/src/database/database.cpp b/src/database/database.cpp index 6ec93731..ea1557b9 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -44,7 +44,7 @@ const std::vector Database::database = { // Constructor, computing device properties and populating the parameter-vector from the database Database::Database(const Queue &queue, const std::vector &kernels, - const Precision precision): + const Precision precision, const std::vector &overlay): parameters_{} { // Finds information of the current device @@ -53,10 +53,23 @@ Database::Database(const Queue &queue, const std::vector &kernels, auto device_vendor = device.Vendor(); auto device_name = device.Name(); + // Set the short vendor name + for (auto &combination : kVendorNames) { + if (device_vendor == combination.first) { + device_vendor = combination.second; + } + } + // Iterates over all kernels to include, and retrieves the parameters for each of them for (auto &kernel: kernels) { - auto search_result = Search(kernel, device_type, device_vendor, device_name, precision); - parameters_.insert(search_result.begin(), search_result.end()); + auto search_result = ParametersPtr{}; + + for (auto db: { &overlay, &database }) { + search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db); + if (search_result) { parameters_.insert(search_result->begin(), search_result->end()); break; } + } + + if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); } } } @@ -74,27 +87,21 @@ std::string Database::GetDefines() const { // ================================================================================================= // Searches the database for the right kernel and precision -Database::Parameters Database::Search(const std::string &this_kernel, - const std::string &this_type, - const std::string &this_vendor, - const std::string &this_device, - const Precision this_precision) const { - // Set the short vendor name - auto this_short_vendor = this_vendor; - for (auto &combination : kVendorNames) { - if (this_vendor == combination.first) { - this_short_vendor = combination.second; - } - } +Database::ParametersPtr Database::Search(const std::string &this_kernel, + const std::string &this_type, + const std::string &this_vendor, + const std::string &this_device, + const Precision this_precision, + const std::vector &this_database) const { // Selects the right kernel - for (auto &db: database) { + for (auto &db: this_database) { if (db.kernel == this_kernel && db.precision == this_precision) { // Searches for the right vendor and device type, or selects the default if unavailable. This // assumes that the default vendor / device type is last in the database. for (auto &vendor: db.vendors) { - if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) && + if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) && (vendor.type == this_type || vendor.type == kDeviceTypeAll)) { // Searches for the right device. 
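As an aside to this commit: the overlay mechanism is essentially a two-stage lookup, searching a user-supplied parameter table before the built-in one and raising an error only when both miss. The self-contained C++ sketch below illustrates that pattern; the Entry and Search names and the parameter values are illustrative stand-ins, not CLBlast's actual types or tuning data.

// Standalone sketch of an overlay-first parameter lookup (illustrative only).
#include <cstdio>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

using Parameters = std::unordered_map<std::string, size_t>;

struct Entry {
  std::string kernel;
  Parameters parameters;
};

// Returns a pointer to the parameters for 'kernel' in 'db', or nullptr if not found
const Parameters* Search(const std::string &kernel, const std::vector<Entry> &db) {
  for (const auto &entry : db) {
    if (entry.kernel == kernel) { return &entry.parameters; }
  }
  return nullptr;
}

int main() {
  const std::vector<Entry> built_in = {{"Xgemv", {{"WGS1", 64}, {"WPT1", 1}}}};
  const std::vector<Entry> overlay = {{"Xgemv", {{"WGS1", 128}, {"WPT1", 2}}}};

  // The overlay takes precedence over the built-in database
  const Parameters *result = nullptr;
  for (const auto *db : {&overlay, &built_in}) {
    result = Search("Xgemv", *db);
    if (result) { break; }
  }
  if (!result) { throw std::runtime_error("no suitable database entry"); }
  std::printf("WGS1=%zu WPT1=%zu\n", result->at("WGS1"), result->at("WPT1"));
  return 0;
}

Returning a pointer (nullptr on a miss) instead of throwing from Search() is what lets the caller try several databases in order, exactly like the loop over { &overlay, &database } in this commit.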
If the current device is unavailable, selects the vendor @@ -104,7 +111,7 @@ Database::Parameters Database::Search(const std::string &this_kernel, if (device.name == this_device || device.name == "default") { // Sets the parameters accordingly - return device.parameters; + return &device.parameters; } } } @@ -113,7 +120,7 @@ Database::Parameters Database::Search(const std::string &this_kernel, } // If we reached this point, something is wrong - throw std::runtime_error("Database error, could not find a suitable entry"); + return nullptr; } // ================================================================================================= diff --git a/src/database/database.hpp b/src/database/database.hpp index 0987cbed..5a61fad9 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -32,6 +32,7 @@ class Database { // Type alias for the database parameters using Parameters = std::unordered_map; + using ParametersPtr = const Parameters*; // Structures for content inside the database struct DatabaseDevice { @@ -78,9 +79,9 @@ class Database { static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; static const std::vector database; - // The constructor + // The constructor with a user-provided database overlay explicit Database(const Queue &queue, const std::vector &routines, - const Precision precision); + const Precision precision, const std::vector &overlay); // Accessor of values by key size_t operator[](const std::string key) const { return parameters_.find(key)->second; } @@ -93,6 +94,11 @@ class Database { const std::string &this_vendor, const std::string &this_device, const Precision this_precision) const; + // Alternate search method in a specified database, returning pointer (possibly NULL) + ParametersPtr Search(const std::string &this_kernel, const std::string &this_type, + const std::string &this_vendor, const std::string &this_device, + const Precision this_precision, const std::vector &db) const; + // Found parameters suitable for this device/kernel Parameters parameters_; }; diff --git a/src/routine.cpp b/src/routine.cpp index 3c3343da..189ae190 100644 --- a/src/routine.cpp +++ b/src/routine.cpp @@ -22,7 +22,8 @@ namespace clblast { // Constructor: not much here, because no status codes can be returned Routine::Routine(Queue &queue, EventPointer event, const std::string &name, - const std::vector &routines, const Precision precision): + const std::vector &routines, const Precision precision, + const std::vector &userDatabase): precision_(precision), routine_name_(name), queue_(queue), @@ -30,7 +31,7 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name, context_(queue_.GetContext()), device_(queue_.GetDevice()), device_name_(device_.Name()), - db_(queue_, routines, precision_) { + db_(queue_, routines, precision_, userDatabase) { } // ================================================================================================= diff --git a/src/routine.hpp b/src/routine.hpp index 54b5779f..21506e7b 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -34,7 +34,8 @@ class Routine { // Base class constructor explicit Routine(Queue &queue, EventPointer event, const std::string &name, - const std::vector &routines, const Precision precision); + const std::vector &routines, const Precision precision, + const std::vector &userDatabase = {}); // Set-up phase of the kernel StatusCode SetUp(); -- cgit v1.2.3 From 75fe8235f78520fbbfff7c9c035ecd9f1aa3e6f6 Mon 
Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 23 Jul 2016 10:20:11 +0200 Subject: Improved the XgemvFastRot kernel by tiled loading of the input matrix A, enabling better memory performance --- src/database/kernels/xgemv.hpp | 2 +- src/kernels/level2/xgemv_fast.opencl | 137 +++++++++++++++++------------------ src/routines/level2/xgemv.cpp | 2 +- src/tuning/kernels/xgemv.cpp | 20 +++-- 4 files changed, 85 insertions(+), 76 deletions(-) diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 65f4b5c8..6fb68858 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -36,7 +36,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Oland", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 1127a0b6..1d34de96 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -38,7 +38,7 @@ R"( #define WGS3 64 // The local work-group size #endif #ifndef WPT3 - #define WPT3 1 // The amount of work-per-thread + #define WPT3 1 // The tile-size #endif #ifndef VW3 #define VW3 1 // Vector width of matrix A loads @@ -74,18 +74,12 @@ R"( // ================================================================================================= -// Loads a vector input value (1/2) +// Loads a vector input value inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y, const int a_ld) { return agm[a_ld*y + x]; } -// Loads a vector input value (2/2): as before, but different data-type -inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y, - const int a_ld) { - return agm[a_ld*y + x]; -} - // ================================================================================================= // Faster version of the kernel, assuming that: @@ -110,7 +104,7 @@ __kernel void XgemvFast(const int m, const int n, // Local memory for the vector X __local real xlm[WGS2]; - // Initializes the accumulation register + // Initializes the accumulation registers real acc[WPT2]; #pragma unroll for (int w=0; w::MatVec(const Layout layout, const Transpose a_transpose, } if (fast_kernel_rot) { kernel_name = "XgemvFastRot"; - global_size = m_real / db_["WPT3"]; + global_size = m_real; local_size = db_["WGS3"]; } diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 5c187d33..b69e4352 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -61,8 +61,9 @@ class TuneXgemv { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256}); - tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); + tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 
256}); + if (V==1 || V==2) { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4}); } + else { tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32}); } if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); } } @@ -74,8 +75,14 @@ class TuneXgemv { } } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { - auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; - tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); + if (V==1 || V==2) { + auto LocalMemorySize = [args] (std::vector v) { return v[0]*GetBytes(args.precision); }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)}); + } + else { + auto LocalMemorySize = [args] (std::vector v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); }; + tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + } } // Sets the base thread configuration @@ -89,7 +96,10 @@ class TuneXgemv { static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; } static TransformVector DivLocal() { return {}; } static TransformVector MulGlobal() { return {}; } - static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; } + static TransformVector DivGlobal() { + if (V==1 || V==2) return {{"WPT"+std::to_string(V)}}; + return {}; + } // Sets the kernel's arguments static void SetArguments(cltune::Tuner &tuner, const Arguments &args, -- cgit v1.2.3 From 7a4f9637639ce83191bc2d6e8485f9a9dfd949af Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 23 Jul 2016 14:52:32 +0200 Subject: Further improvements to the XgemvFastRot kernel, properly enables coalescing now --- src/database/kernels/xgemv.hpp | 2 +- src/kernels/level2/xgemv_fast.opencl | 84 ++++++++++++++++++------------------ src/tuning/kernels/xgemv.cpp | 8 +++- 3 files changed, 50 insertions(+), 44 deletions(-) diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 6fb68858..3aa1863f 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -36,7 +36,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",32}, {"WPT3",32} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, { "Oland", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 1d34de96..359c3770 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -204,10 +204,10 @@ __kernel void XgemvFastRot(const int m, const int n, const real beta = GetRealArg(arg_beta); // Local memory to store a tile of the matrix (for coalescing) - __local real tile[WGS3 * WPT3]; + __local real tile[WPT3][WGS3]; const int lid = get_local_id(0); - const int lid_mod = lid % WPT3; - const int lid_div = lid / WPT3; + const int lid_mod = lid % (WPT3/VW3); + const int lid_div = lid 
/ (WPT3/VW3); // Local memory for the vector X __local real xlm[WPT3]; @@ -225,45 +225,45 @@ __kernel void XgemvFastRot(const int m, const int n, // Loads the matrix A into local memory #pragma unroll for (int kl=0; kl v) { return IsMultiple(v[0], v[1]); }; if (V==2 || V==3) { + auto MultipleOfX = [] (std::vector v) { return IsMultiple(v[0], v[1]); }; tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)}); } + if (V==3) { + auto LargerOrEqual = [] (std::vector v) { return v[0] >= v[1]; }; + tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)}); + } } static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments &args) { if (V==1 || V==2) { -- cgit v1.2.3 From 40a72259eba491631d8875aae465c5a93d7fed02 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 23 Jul 2016 16:58:11 +0200 Subject: Fixe a bug in the new XgemvFastRot kernel related to local memory size --- CHANGELOG | 1 + src/kernels/level2/xgemv_fast.opencl | 10 ++++++---- src/tuning/kernels/xgemv.cpp | 18 ++++++++++++++---- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index b6e09102..d018e211 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,7 @@ Development version (next release) - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels - Added an option (-warm_up) to do a warm-up run before timing in the performance clients +- Improved performance significantly of rotated GEMV computations - Added tuned parameters for various devices (see README) Version 0.8.0 diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl index 359c3770..210c42c1 100644 --- a/src/kernels/level2/xgemv_fast.opencl +++ b/src/kernels/level2/xgemv_fast.opencl @@ -97,7 +97,7 @@ __kernel void XgemvFast(const int m, const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, - const int kl, const int ku) { + const int kl_unused, const int ku_unused) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -199,7 +199,7 @@ __kernel void XgemvFastRot(const int m, const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc, const int do_conjugate, const int parameter, - const int kl, const int ku) { + const int kl_unused, const int ku_unused) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); @@ -216,11 +216,13 @@ __kernel void XgemvFastRot(const int m, const int n, real acc; SetToZero(acc); - // Loops over work-group sized portions of the work + // Loops over tile-sized portions of the work for (int kwg=0; kwg Date: Sun, 24 Jul 2016 16:41:01 +0200 Subject: Refactored the Python database script: separated functionality in modules, now complies to the PEP8 style, added proper command-line argument parsing, and cleaned-up --- scripts/database/database.py | 356 +++++++--------------------------- scripts/database/database/__init__.py | 0 scripts/database/database/bests.py | 20 ++ scripts/database/database/clblast.py | 132 +++++++++++++ scripts/database/database/db.py | 50 +++++ scripts/database/database/defaults.py | 58 ++++++ scripts/database/database/io.py | 58 ++++++ 7 files changed, 389 
insertions(+), 285 deletions(-) mode change 100644 => 100755 scripts/database/database.py create mode 100644 scripts/database/database/__init__.py create mode 100644 scripts/database/database/bests.py create mode 100644 scripts/database/database/clblast.py create mode 100644 scripts/database/database/db.py create mode 100644 scripts/database/database/defaults.py create mode 100644 scripts/database/database/io.py diff --git a/scripts/database/database.py b/scripts/database/database.py old mode 100644 new mode 100755 index a70b9fc1..e115d68c --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -1,326 +1,112 @@ #!/usr/bin/env python -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren -# -# ================================================================================================== -# System modules import sys import os.path import glob -import re -import json -try: - from urllib.request import urlopen # Python 3 -except ImportError: - from urllib2 import urlopen # Python 2 +import argparse -# Additional modules import pandas as pd -print("## Using pandas version "+pd.__version__+", requires at least 0.17.0") + +import database.io as io +import database.db as db +import database.clblast as clblast +import database.bests as bests +import database.defaults as defaults # Server storing a copy of the database DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db" -# Constants -VENDOR_DEFAULT = "default" -DEVICETYPE_DEFAULT = "All" -DEVICENAME_DEFAULT = "default" - -# Attributes -DEVICETYPE_ATTRIBUTES = ["device_vendor", "device_type"] -DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"] -KERNEL_ATTRIBUTES = ["precision", "kernel_family"] -ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] -ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICETYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES - # OpenCL vendor names and their short name -VENDOR_NAMES = { "device_vendor": { +VENDOR_TRANSLATION_TABLE = {"device_vendor": { "GenuineIntel": "Intel", "Intel(R) Corporation": "Intel", "Advanced Micro Devices, Inc.": "AMD", "NVIDIA Corporation": "NVIDIA", }} -# Pandas options -pd.set_option('display.width', 1000) - -# ================================================================================================== -# Database operations -# ================================================================================================== - -# Downloads the database and save it to disk -def DownloadDatabase(filename): - print("## Downloading database from '"+DATABASE_SERVER_URL+"'...") - df = urlopen(DATABASE_SERVER_URL) - output = open(file_db,'wb') - output.write(df.read()) - output.close() - -# Loads the database from disk -def LoadDatabase(filename): - return pd.read_pickle(filename) - -# Saves the database to disk -def SaveDatabase(df, filename): - df.to_pickle(filename) - -# Loads JSON data from file -def ImportDataFromFile(filename): - with open(filename) as f: - data = json.load(f) - json_data = pd.DataFrame(data) - df = 
pd.io.json.json_normalize(json_data["results"]) - for attribute in ATTRIBUTES: - if attribute == "kernel_family": - df[attribute] = re.sub(r'_\d+', '', data[attribute]) - elif attribute in data: - df[attribute] = data[attribute] - else: - df[attribute] = 0 - return df - -# Returns the row-wise concatenation of two dataframes -def ConcatenateData(df1, df2): - return pd.concat([df1, df2]) - -# Removes duplicates from a dataframe -def RemoveDuplicates(df): - return df.drop_duplicates() - -# database = database[(database["device"] != "AMD Radeon R9 M370X Compute Engine") | (database["kernel_family"] != "xgemm") | (database["precision"] != "32")] -def RemoveEntriesByDevice(df, devicename): - return df[df["device"] != devicename] - -def RemoveEntriesByKernelFamily(df, familyname): - return df[df["kernel_family"] != familyname] - -def GetEntriesByField(df, field, value): - return df[df[field] == value] - -# Example usage: -# df = UpdateDatabase(df, (df["kernel_family"] == "xdot") & (df["arg_n"] == "67108864"), "arg_n", "2097152") -def UpdateDatabase(df, condition, field, value): - df.loc[condition, field] = value - return df - -# Fixes the problem that some vendors use multiple different names -def SanitizeVendorNames(df): - df = df.replace(VENDOR_NAMES) - return df - -# Retrieves the results with the lowest execution times -def GetBestResults(df): - dfbest = pd.DataFrame() - grouped = df.groupby(ATTRIBUTES+["kernel"]) - for name, dfgroup in grouped: - besttime = dfgroup["time"].min() - bestcase = dfgroup[dfgroup["time"] == besttime].iloc[0] - dfbest = dfbest.append(bestcase, ignore_index=True) - return dfbest - -# Sets defaults for devices of the same type/vendor based on the smallest values of all know -# entries. The average might be better for performance but some parameters might not be supported -# on other devices. 
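The removed comment above captures the key design choice for defaults: per parameter, take the minimum over all tuned entries of the same device type and vendor, since a small value (for example a small work-group size) is more likely to be supported everywhere than an average would be. A minimal C++ sketch of that reduction, using invented parameter values:

// Sketch: a "default" parameter set as the per-key minimum over tuned entries (invented data).
#include <algorithm>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

using Parameters = std::map<std::string, size_t>;

Parameters ComputeDefaults(const std::vector<Parameters> &entries) {
  Parameters defaults;
  for (const auto &entry : entries) {
    for (const auto &kv : entry) {
      auto it = defaults.find(kv.first);
      if (it == defaults.end()) { defaults[kv.first] = kv.second; }
      else { it->second = std::min(it->second, kv.second); }
    }
  }
  return defaults;
}

int main() {
  // Hypothetical tuning results for two devices of the same vendor/type
  const std::vector<Parameters> tuned = {{{"WGS1", 256}, {"WPT1", 4}},
                                         {{"WGS1", 64}, {"WPT1", 2}}};
  for (const auto &kv : ComputeDefaults(tuned)) {
    std::printf("%s = %zu\n", kv.first.c_str(), kv.second);  // prints WGS1 = 64 and WPT1 = 2
  }
  return 0;
}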
-def CalculateDefaults(df): - dfdefault = pd.DataFrame() - - # Defaults per type/vendor - groups = df.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"]) - for name, dfgroup in groups: - default_values = dfgroup.min(axis=0) - default_values["device"] = DEVICENAME_DEFAULT - default_values["device_compute_units"] = 0 - default_values["device_core_clock"] = 0 - default_values["time"] = 0.0 - dfdefault = dfdefault.append(default_values, ignore_index=True) - - # Checks for mis-matched arguments - groups = dfdefault.groupby(DEVICETYPE_ATTRIBUTES+KERNEL_ATTRIBUTES+["kernel"]) - for name, dfgroup in groups: - if len(dfgroup) != 1: - description = dfgroup["kernel"].min() + " " + dfgroup["device_vendor"].min() - print("[WARNING] Entries for a single kernel with multiple argument values: " + description) - - # Defaults in general - groups = df.groupby(KERNEL_ATTRIBUTES+ARGUMENT_ATTRIBUTES+["kernel"]) - for name, dfgroup in groups: - default_values = dfgroup.min(axis=0) - default_values["device_vendor"] = VENDOR_DEFAULT - default_values["device_type"] = DEVICETYPE_DEFAULT - default_values["device"] = DEVICENAME_DEFAULT - default_values["device_compute_units"] = 0 - default_values["device_core_clock"] = 0 - default_values["time"] = 0.0 - dfdefault = dfdefault.append(default_values, ignore_index=True) - - # Database with both types of defaults only - return dfdefault - -# ================================================================================================== -# C++ header generation -# ================================================================================================== - -# The C++ header -def GetHeader(family): - return(""" -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Database generator -// -// This file populates the database with best-found tuning parameters for the '%s' kernels. 
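For context, the headers emitted by this generator (such as src/database/kernels/xgemv.hpp) follow a fixed nesting: kernel and precision, then vendor and device type, then individual devices with their parameter maps. Below is a compilable mock of that shape; the struct definitions are simplified and every value is invented purely for illustration, so they should not be read as real tuning results.

// Simplified, self-contained mock of the generated database structure (all values invented).
#include <string>
#include <unordered_map>
#include <vector>

struct DatabaseDevice { std::string name; std::unordered_map<std::string, size_t> parameters; };
struct DatabaseVendor { std::string type, name; std::vector<DatabaseDevice> devices; };
struct DatabaseEntry { std::string kernel, precision; std::vector<DatabaseVendor> vendors; };

// Shaped like a generated entry: per vendor/device-type block, a list of devices plus a "default"
const DatabaseEntry ExampleXgemvSingle = {
  "Xgemv", "Single", {
    { "GPU", "ExampleVendor", {
        { "Example Device", { {"WGS1", 64}, {"WPT1", 1} } },
        { "default", { {"WGS1", 64}, {"WPT1", 1} } },
      }
    },
    { "All", "default", {
        { "default", { {"WGS1", 64}, {"WPT1", 1} } },
      }
    },
  }
};

int main() {
  return (ExampleXgemvSingle.vendors.size() == 2) ? 0 : 1;  // trivial sanity check
}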
-// -// ================================================================================================= - -namespace clblast { -// =================================================================================================""" - % family.title()) - -# The C++ footer -def GetFooter(): - return("\n} // namespace clblast\n") - -# The start of a new C++ precision entry -def GetPrecision(family, precision): - precisionstring = "" - if precision == "16": - precisionstring = "Half" - elif precision == "32": - precisionstring = "Single" - elif precision == "64": - precisionstring = "Double" - elif precision == "3232": - precisionstring = "ComplexSingle" - elif precision == "6464": - precisionstring = "ComplexDouble" - else: - print("[ERROR] Unknown precision") - sys.exit() - return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n" - % (family.title(), precisionstring, family.title(), precisionstring)) - -# The C++ device type and vendor -def GetDeviceVendor(vendor, devtype): - if vendor == VENDOR_DEFAULT and devtype == DEVICETYPE_DEFAULT: - return(" { // Default\n kDeviceType%s, \"%s\", {\n" % (devtype, vendor)) - return(" { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, devtype, devtype[0].upper() + devtype[1:], vendor)) - -# Prints the data to a C++ database -def PrintData(df, outputdir): - - # Iterates over the kernel families: creates a new file per family - for family, dffamily in df.groupby(["kernel_family"]): - dffamily = dffamily.dropna(axis=1, how='all') - f = open(os.path.join(outputdir, family+'.hpp'), 'w+') - f.write(GetHeader(family)) - - # Loops over the different entries for this family and prints their headers - for precision, dfprecision in dffamily.groupby(["precision"]): - f.write(GetPrecision(family, precision)) - for vendor, dfvendor in dfprecision.groupby(["device_vendor"]): - for devtype, dfdevtype in dfvendor.groupby(["device_type"]): - f.write(GetDeviceVendor(vendor, devtype)) - for device, dfdevice in dfdevtype.groupby(["device"]): - devicename = "\"%s\"," % device - f.write(" { %-50s { " % devicename) - # Collects the paramaters for this case and prints them - parameters = [] - for kernel, dfkernel in dfdevice.groupby(["kernel"]): - dfkernel = dfkernel.dropna(axis=1) - col_names = [col for col in list(dfkernel) if col.startswith('parameters.') and col != "parameters.PRECISION"] - parameters += ["{\"%s\",%d}" % (p.replace("parameters.",""), dfkernel[p].iloc[0]) for p in col_names] - f.write(", ".join(parameters)) - f.write(" } },\n") +def main(argv): - # Prints the footers - f.write(" }\n },\n") - f.write(" }\n};\n\n// =================================================================================================") - f.write(GetFooter()) + # Parses the command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument("source_folder", help="The folder with JSON files to parse to add to the database") + parser.add_argument("clblast_root", help="Root of the CLBlast sources") + parser.add_argument("-v", "--verbose", action="store_true", help="Increase verbosity of the script") + cl_args = parser.parse_args(argv) -# ================================================================================================== -# Command-line arguments parsing and verification -# ================================================================================================== + # Parses the path arguments + database_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database.db") + json_files = 
os.path.join(cl_args.source_folder, "*.json") + cpp_database_path = os.path.join(cl_args.clblast_root, "src", "database", "kernels") -# Checks for the number of command-line arguments -if len(sys.argv) != 3: - print("[ERROR] Usage: database.py ") - sys.exit() + # Checks whether the command-line arguments are valid + clblast_header = os.path.join(cl_args.clblast_root, "include", "clblast.h") # Not used but just for validation + if not os.path.isfile(clblast_header): + raise RuntimeError("The path '" + cl_args.clblast_root + "' does not point to the root of the CLBlast library") + if len(glob.glob(json_files)) < 1: + print("[database] The path '" + cl_args.source_folder + "' does not contain any JSON files") -# Parses the command-line arguments -path_json = sys.argv[1] -path_clblast = sys.argv[2] -file_db = os.path.join(path_clblast, "scripts", "database", "database.db") -glob_json = os.path.join(path_json, "*.json") + # Pandas options + pd.set_option('display.width', 1000) + if cl_args.verbose: + print("[database] Using pandas version " + pd.__version__) -# Checks whether the command-line arguments are valid; exists otherwise -clblast_h = os.path.join(path_clblast, "include", "clblast.h") # Not used but just for validation -if not os.path.isfile(clblast_h): - print("[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library") - sys.exit() -if len(glob.glob(glob_json)) < 1: - print("## The path '"+path_json+"' does not contain any JSON files") + # Downloads the database if a local copy is not present + if not os.path.isfile(database_filename): + io.download_database(database_filename, DATABASE_SERVER_URL) -# ================================================================================================== -# The main body of the script -# ================================================================================================== + # Loads the database from disk + database = io.load_database(database_filename) -# Downloads the database if a local copy is not present -db_exists = os.path.isfile(file_db) -if not db_exists: - DownloadDatabase(file_db) + # Loops over all JSON files in the supplied folder + for file_json in glob.glob(json_files): -# Loads the database from disk -print("## Loading the database from disk...") -database = LoadDatabase(file_db) + # Loads the newly imported data + sys.stdout.write("[database] Processing '"+file_json+"' ") # No newline printed + imported_data = io.load_json_to_pandas(file_json) -# Loops over all JSON files in the supplied folder -for file_json in glob.glob(glob_json): + # Fixes the problem that some vendors use multiple different names + imported_data = db.find_and_replace(imported_data, VENDOR_TRANSLATION_TABLE) - # Loads the newly imported data - sys.stdout.write("## Processing '"+file_json+"' ") - imported_data = ImportDataFromFile(file_json) - imported_data = SanitizeVendorNames(imported_data) + # Adds the new data to the database + old_size = len(database.index) + database = db.concatenate_database(database, imported_data) + database = db.remove_duplicates(database) + new_size = len(database.index) + print("with " + str(new_size - old_size) + " new items") # Newline printed here - # Adds the new data to the database - old_size = len(database.index) - database = ConcatenateData(database, imported_data) - database = RemoveDuplicates(database) - new_size = len(database.index) - print("with "+str(new_size-old_size)+" new items") + # Stores the modified database back to disk + if len(glob.glob(json_files)) >= 1: + 
io.save_database(database, database_filename) -# Stores the modified database back to disk -if len(glob.glob(glob_json)) >= 1: - print("## Storing the database to disk...") - SaveDatabase(database, file_db) + # Optional: update the database here. Default is disabled, code below is just an example + if False: # TODO: Use command-line arguments to enable updates in a flexible way + database = db.update_database(database, + ((database["kernel"] == "CopyMatrixFast") & + (database["precision"] == "3232")), + "arg_alpha", "2+0.5i") + io.save_database(database, database_filename) -# Optional: update the database here. Default is disabled, code below is just an example -if False: - database = UpdateDatabase(database, ((database["kernel"] == "CopyMatrixFast") & (database["precision"] == "3232")), "arg_alpha", "2+0.5i") - SaveDatabase(database, file_db) + # Retrieves the best performing results + print("[database] Calculating the best results per device/kernel...") + database_best_results = bests.get_best_results(database) -# Retrieves the best performing results -print("## Calculating the best results per device/kernel...") -bests = GetBestResults(database) + # Determines the defaults for other vendors and per vendor + database_defaults = defaults.calculate_defaults(database_best_results) + database_best_results = db.concatenate_database(database_best_results, database_defaults) -# Determines the defaults for other vendors and per vendor -defaults = CalculateDefaults(bests) -bests = ConcatenateData(bests, defaults) + # Outputs the database as a C++ database + print("[database] Producing a C++ database in '" + cpp_database_path + "'...") + clblast.print_cpp_database(database_best_results, cpp_database_path) -# Outputs the data as a C++ database -path_cpp_database = os.path.join(path_clblast, "src", "database", "kernels") -print("## Producing a C++ database in '"+path_cpp_database+"'...") -PrintData(bests, path_cpp_database) + print("[database] All done") -print("## All done") -# ================================================================================================== +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/scripts/database/database/__init__.py b/scripts/database/database/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/database/database/bests.py b/scripts/database/database/bests.py new file mode 100644 index 00000000..edb81733 --- /dev/null +++ b/scripts/database/database/bests.py @@ -0,0 +1,20 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import pandas as pd +import clblast + + +def get_best_results(df): + """Retrieves the results with the lowests execution times""" + database_bests = pd.DataFrame() + database_entries = df.groupby(clblast.ATTRIBUTES + ["kernel"]) + for name, database_entry in database_entries: + best_time = database_entry["time"].min() + best_parameters = database_entry[database_entry["time"] == best_time].iloc[0] + database_bests = database_bests.append(best_parameters, ignore_index=True) + return database_bests diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py new file mode 100644 index 00000000..9c9f7eb4 --- /dev/null +++ b/scripts/database/database/clblast.py @@ -0,0 +1,132 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. 
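The new bests.py module above groups the measurements per device and kernel and keeps only the configuration with the lowest execution time. The same selection, reduced to a small C++ sketch over hypothetical timings (the kernel names are real, the parameters and times are made up):

// Sketch: keep the fastest configuration per kernel (hypothetical measurements).
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Measurement {
  std::string kernel;
  std::string parameters;
  double time_ms;
};

std::map<std::string, Measurement> BestPerKernel(const std::vector<Measurement> &results) {
  std::map<std::string, Measurement> best;
  for (const auto &r : results) {
    const auto it = best.find(r.kernel);
    if (it == best.end() || r.time_ms < it->second.time_ms) { best[r.kernel] = r; }
  }
  return best;
}

int main() {
  const std::vector<Measurement> results = {
    {"XgemvFastRot", "WGS3=32 WPT3=32", 0.42},
    {"XgemvFastRot", "WGS3=64 WPT3=16", 0.55},
    {"Xgemv", "WGS1=128", 0.80},
  };
  for (const auto &kv : BestPerKernel(results)) {
    std::printf("%s -> %s (%.2f ms)\n",
                kv.first.c_str(), kv.second.parameters.c_str(), kv.second.time_ms);
  }
  return 0;
}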
This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import os + +# Constants from the C++ code +VENDOR_DEFAULT = "default" +DEVICE_TYPE_DEFAULT = "All" +DEVICE_NAME_DEFAULT = "default" + +# List of attributes +DEVICE_TYPE_ATTRIBUTES = ["device_vendor", "device_type"] +DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"] +KERNEL_ATTRIBUTES = ["precision", "kernel_family"] +ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] +ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES + + +def precision_to_string(precision): + """Translates a precision number (represented as Python string) into a descriptive string""" + if precision == "16": + return "Half" + elif precision == "32": + return "Single" + elif precision == "64": + return "Double" + elif precision == "3232": + return "ComplexSingle" + elif precision == "6464": + return "ComplexDouble" + else: + raise("Unknown precision: " + precision) + + +def get_cpp_separator(): + """Retrieves a C++ comment separator""" + return "// =================================================================================================" + + +def get_cpp_header(family): + """Retrieves the C++ header""" + return ("\n" + get_cpp_separator() + """ +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the '%s' kernels. +//\n""" + % family.title() + get_cpp_separator() + "\n\nnamespace clblast {\n" + get_cpp_separator()) + + +def get_cpp_footer(): + """Retrieves the C++ footer""" + return "\n} // namespace clblast\n" + + +def get_cpp_precision(family, precision): + """Retrieves the C++ code for the start of a new precision""" + precision_string = precision_to_string(precision) + return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n" + % (family.title(), precision_string, family.title(), precision_string)) + + +def get_cpp_device_vendor(vendor, device_type): + """Retrieves the C++ code for the (default) vendor and device type""" + if vendor == VENDOR_DEFAULT and device_type == DEVICE_TYPE_DEFAULT: + return " { // Default\n kDeviceType%s, \"%s\", {\n" % (device_type, vendor) + device_type_caps = device_type[0].upper() + device_type[1:] + return " { // %s %ss\n kDeviceType%s, \"%s\", {\n" % (vendor, device_type, device_type_caps, vendor) + + +def print_cpp_database(database, output_dir): + """Outputs the database as C++ code""" + + # Iterates over the kernel families + for family_name, family_database in database.groupby(["kernel_family"]): + family_database = family_database.dropna(axis=1, how='all') + + # Opens a new file for each kernel family + full_path = os.path.join(output_dir, family_name+'.hpp') + with open(full_path, 'w+') as f: + f.write(get_cpp_header(family_name)) + + # Loops over the different precision (e.g. 16, 32, 3232, 64, 6464) + for precision, precision_database in family_database.groupby(["precision"]): + f.write(get_cpp_precision(family_name, precision)) + + # Loops over a combination of device vendors and device types (e.g. 
AMD GPU) + for vendor, vendor_database in precision_database.groupby(["device_vendor"]): + for device_type, device_type_database in vendor_database.groupby(["device_type"]): + f.write(get_cpp_device_vendor(vendor, device_type)) + + # Loops over every device of this vendor-type combination + for device_name, device_database in device_type_database.groupby(["device"]): + device_name_quoted = "\"%s\"," % device_name + device_name_cpp = " { %-50s { " % device_name_quoted + f.write(device_name_cpp) + + # Collects the parameters for this entry + parameters = [] + for kernel, kernel_database in device_database.groupby(["kernel"]): + kernel_database = kernel_database.dropna(axis=1) + + # Only consider the actual parameters, not the precision + def is_parameter(column): + return column.startswith('parameters.') and column != "parameters.PRECISION" + column_names = [col for col in list(kernel_database) if is_parameter(col)] + + for p in column_names: + parameter_name = p.replace("parameters.", "") + parameter_value = int(kernel_database[p].iloc[0]) + parameters.append("{\"" + parameter_name + "\"," + str(parameter_value) + "}") + + # Prints the entry + f.write(", ".join(parameters)) + f.write(" } },\n") + + # Prints the vendor-type combination footer + f.write(" }\n },\n") + + # Prints the precision footer + f.write(" }\n};\n\n" + get_cpp_separator()) + + # Prints the file footer + f.write(get_cpp_footer()) diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py new file mode 100644 index 00000000..60cfbcfa --- /dev/null +++ b/scripts/database/database/db.py @@ -0,0 +1,50 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import pandas as pd + + +def get_entries_by_field(database, field, value): + """Retrieves entries from the database with a specific value for a given field""" + return database[database[field] == value] + + +def concatenate_database(database1, database2): + """Concatenates two databases row-wise and returns the result""" + return pd.concat([database1, database2]) + + +def remove_duplicates(database): + """Removes duplicates from a database""" + return database.drop_duplicates() + + +def find_and_replace(database, dictionary): + """Finds and replaces entries in a database based on a dictionary. 
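In this project, find_and_replace() is mostly used with the vendor translation table shown earlier, collapsing the various vendor strings reported by OpenCL drivers onto the short names stored in the database. A C++ sketch of that normalisation (unknown vendors pass through unchanged; the table mirrors VENDOR_TRANSLATION_TABLE above):

// Sketch: normalise OpenCL vendor strings to the short names used in the database.
#include <cstdio>
#include <string>
#include <unordered_map>

std::string NormaliseVendor(const std::string &vendor) {
  static const std::unordered_map<std::string, std::string> translation = {
    {"GenuineIntel", "Intel"},
    {"Intel(R) Corporation", "Intel"},
    {"Advanced Micro Devices, Inc.", "AMD"},
    {"NVIDIA Corporation", "NVIDIA"},
  };
  const auto it = translation.find(vendor);
  return (it != translation.end()) ? it->second : vendor;  // unknown vendors pass through
}

int main() {
  std::printf("%s\n", NormaliseVendor("Advanced Micro Devices, Inc.").c_str());  // AMD
  std::printf("%s\n", NormaliseVendor("ARM").c_str());                           // ARM (unchanged)
  return 0;
}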
Example: + dictionary = { "key_to_edit": { find1: replace1, find2, replace2 } }""" + return database.replace(dictionary) + + +def remove_entries_by_key_value(database, key, value): + """Removes entries in the databased which have a specific value for a given key""" + return database[database[key] != value] + + +def remove_entries_by_device(database, device_name): + """Shorthand for the above, specifically removes entries for a given device""" + return remove_entries_by_key_value(database, "device", device_name) + + +def remove_entries_by_kernel_family(database, kernel_family_name): + """Shorthand for the above, specifically removes entries for a given kernel family""" + return remove_entries_by_key_value(database, "kernel_family", kernel_family_name) + + +def update_database(database, condition, field, value): + """Updates the database by writing a specific value to a given field, given certain conditions""" + database.loc[condition, field] = value + return database diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py new file mode 100644 index 00000000..357c3a3a --- /dev/null +++ b/scripts/database/database/defaults.py @@ -0,0 +1,58 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +import pandas as pd +import clblast + + +def set_default_device(database_entry): + """Sets the device name and parameters to some default values""" + database_entry["device"] = clblast.DEVICE_NAME_DEFAULT + database_entry["device_compute_units"] = 0 + database_entry["device_core_clock"] = 0 + return database_entry + + +def set_default_time(database_entry): + """Sets the execution time to some default value""" + database_entry["time"] = 0.0 + return database_entry + + +def calculate_defaults(df): + """# Sets defaults for devices of the same type/vendor based on the smallest values of all known entries. The average + might be better for performance but some parameters might not be supported on other devices.""" + database_defaults = pd.DataFrame() + + # Defaults per combination of device vendors and device types (e.g. 
AMD GPU) + database_type_vendor = df.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + + clblast.ARGUMENT_ATTRIBUTES) + for group_name, database_group in database_type_vendor: + default_values = database_group.min(axis=0) + default_values = set_default_device(default_values) + default_values = set_default_time(default_values) + database_defaults = database_defaults.append(default_values, ignore_index=True) + + # Checks for mis-matched arguments + groups = database_defaults.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"]) + for group_name, database_group in groups: + if len(database_group) != 1: + description = database_group["kernel"].min() + " " + database_group["device_vendor"].min() + print("[WARNING] Entries for a single kernel with multiple argument values: " + description) + + # Defaults over all device types and vendors + groups = df.groupby(clblast.KERNEL_ATTRIBUTES + ["kernel"] + clblast.ARGUMENT_ATTRIBUTES) + for group_name, database_group in groups: + default_values = database_group.min(axis=0) + default_values["device_vendor"] = clblast.VENDOR_DEFAULT + default_values["device_type"] = clblast.DEVICE_TYPE_DEFAULT + default_values = set_default_device(default_values) + default_values = set_default_time(default_values) + database_defaults = database_defaults.append(default_values, ignore_index=True) + + # Database with both types of defaults only + return database_defaults diff --git a/scripts/database/database/io.py b/scripts/database/database/io.py new file mode 100644 index 00000000..ad2f7ae9 --- /dev/null +++ b/scripts/database/database/io.py @@ -0,0 +1,58 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. 
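calculate_defaults() above also warns when a group contains entries tuned with different argument values, because a single default would then silently mix results from runs that are not comparable. A small C++ sketch of that consistency check, on hypothetical data:

// Sketch: warn when entries for one kernel disagree on the tuning arguments (hypothetical data).
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Entry {
  std::string kernel;
  std::string arguments;  // e.g. "m=1024 n=1024"
};

void WarnOnMismatchedArguments(const std::vector<Entry> &entries) {
  std::map<std::string, std::set<std::string>> arguments_per_kernel;
  for (const auto &entry : entries) {
    arguments_per_kernel[entry.kernel].insert(entry.arguments);
  }
  for (const auto &kv : arguments_per_kernel) {
    if (kv.second.size() > 1) {
      std::printf("[WARNING] Entries for kernel '%s' use %zu different argument sets\n",
                  kv.first.c_str(), kv.second.size());
    }
  }
}

int main() {
  WarnOnMismatchedArguments({{"Xgemv", "n=2048"}, {"Xgemv", "n=4096"}, {"Xgemm", "m=n=k=1024"}});
  return 0;
}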
+# +# Author(s): +# Cedric Nugteren + +import re +import json + +try: + from urllib.request import urlopen # Python 3 +except ImportError: + from urllib2 import urlopen # Python 2 + +import pandas as pd + +import clblast + + +def download_database(filename, database_url): + """Downloads a database and saves it to disk""" + print("[database] Downloading database from '" + database_url + "'...") + database = urlopen(database_url) + with open(filename, 'wb') as f: + f.write(database.read()) + + +def load_database(filename): + """Loads a database from disk""" + print("[database] Loading database from '" + filename + "'") + return pd.read_pickle(filename) + + +def save_database(database, filename): + """Saves a database to disk""" + print("[database] Saving database to '" + filename + "'") + database.to_pickle(filename) + + +def load_json_to_pandas(filename): + """Loads JSON data from file and converts it to a pandas database""" + with open(filename) as f: + json_data = json.load(f) + + # Gathers all results and stores them in a new database + json_database = pd.DataFrame(json_data) + new_database = pd.io.json.json_normalize(json_database["results"]) + + # Sets the common attributes to each entry in the results + for attribute in clblast.ATTRIBUTES: + if attribute == "kernel_family": + new_database[attribute] = re.sub(r'_\d+', '', json_data[attribute]) + elif attribute in json_data: + new_database[attribute] = json_data[attribute] + else: + new_database[attribute] = 0 # For example a parameters that was not used by this kernel + return new_database -- cgit v1.2.3 From ffa35c623af4b0916f625f3a41000e75a1df7e1f Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 24 Jul 2016 17:00:21 +0200 Subject: Minor improvements after merging in groundwork for custom tuning parameters and kernels --- src/database/database.cpp | 12 ++++++++---- src/database/database.hpp | 8 ++------ src/routine.hpp | 3 ++- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/database/database.cpp b/src/database/database.cpp index ea1557b9..47f1da16 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -42,7 +42,8 @@ const std::vector Database::database = { // ================================================================================================= -// Constructor, computing device properties and populating the parameter-vector from the database +// Constructor, computing device properties and populating the parameter-vector from the database. +// This takes an optional overlay database in case of custom tuning or custom kernels. 
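One detail of the JSON import above: kernel family names are normalised by stripping any trailing _<number> suffix via re.sub(r'_\d+', '', ...), so variants of a family map onto a single name. The same normalisation expressed as a C++ sketch, with hypothetical input names:

// Sketch: strip a numeric suffix such as "_64" from a kernel-family name.
#include <cstdio>
#include <regex>
#include <string>

std::string StripFamilySuffix(const std::string &kernel_family) {
  static const std::regex suffix("_\\d+");
  return std::regex_replace(kernel_family, suffix, "");  // removes every _<digits> occurrence
}

int main() {
  std::printf("%s\n", StripFamilySuffix("xgemm_64").c_str());  // xgemm
  std::printf("%s\n", StripFamilySuffix("xaxpy").c_str());     // xaxpy (unchanged)
  return 0;
}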
Database::Database(const Queue &queue, const std::vector &kernels, const Precision precision, const std::vector &overlay): parameters_{} { @@ -66,7 +67,10 @@ Database::Database(const Queue &queue, const std::vector &kernels, for (auto db: { &overlay, &database }) { search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db); - if (search_result) { parameters_.insert(search_result->begin(), search_result->end()); break; } + if (search_result) { + parameters_.insert(search_result->begin(), search_result->end()); + break; + } } if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); } @@ -86,7 +90,7 @@ std::string Database::GetDefines() const { // ================================================================================================= -// Searches the database for the right kernel and precision +// Searches a particular database for the right kernel and precision Database::ParametersPtr Database::Search(const std::string &this_kernel, const std::string &this_type, const std::string &this_vendor, @@ -119,7 +123,7 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel, } } - // If we reached this point, something is wrong + // If we reached this point, the entry was not found in this database return nullptr; } diff --git a/src/database/database.hpp b/src/database/database.hpp index 5a61fad9..e84357dc 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -79,7 +79,7 @@ class Database { static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble; static const std::vector database; - // The constructor with a user-provided database overlay + // The constructor with a user-provided database overlay (potentially an empty vector) explicit Database(const Queue &queue, const std::vector &routines, const Precision precision, const std::vector &overlay); @@ -90,11 +90,7 @@ class Database { std::string GetDefines() const; private: - Parameters Search(const std::string &this_kernel, const std::string &this_type, - const std::string &this_vendor, const std::string &this_device, - const Precision this_precision) const; - - // Alternate search method in a specified database, returning pointer (possibly NULL) + // Search method for a specified database, returning pointer (possibly a nullptr) ParametersPtr Search(const std::string &this_kernel, const std::string &this_type, const std::string &this_vendor, const std::string &this_device, const Precision this_precision, const std::vector &db) const; diff --git a/src/routine.hpp b/src/routine.hpp index 21506e7b..f5c607af 100644 --- a/src/routine.hpp +++ b/src/routine.hpp @@ -32,7 +32,8 @@ namespace clblast { class Routine { public: - // Base class constructor + // Base class constructor. The user database is an optional extra database to override the + // built-in database. 
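The defaulted userDatabase parameter of Routine (just below) relies on the same idiom as the earlier RunKernel change in this series: a const-reference container parameter with a {} default replaces a separate overload, so existing call sites compile unchanged while new ones can pass an override list. A minimal sketch of the idiom with made-up names:

// Sketch: a defaulted const-reference vector parameter replaces a separate "no list" overload.
#include <cstdio>
#include <string>
#include <vector>

// Callers that do not care about 'extras' simply omit the argument; the {} default binds an
// empty temporary vector to the reference for the duration of the call.
void Run(const std::string &name, const std::vector<std::string> &extras = {}) {
  std::printf("%s with %zu extra item(s)\n", name.c_str(), extras.size());
}

int main() {
  Run("kernel_a");                                 // old call sites keep working unchanged
  Run("kernel_b", {"override_1", "override_2"});   // new call sites can pass a list
  return 0;
}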
explicit Routine(Queue &queue, EventPointer event, const std::string &name, const std::vector &routines, const Precision precision, const std::vector &userDatabase = {}); -- cgit v1.2.3 From 2582f0290a396305ee3b86fb544e999fd55fe323 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 25 Jul 2016 22:43:49 +0200 Subject: Moved the XgemvFast and XgemvFastRot tuning database into a separate file --- scripts/database/database/clblast.py | 3 +- src/database/database.cpp | 6 +- src/database/database.hpp | 4 +- src/database/kernels/xgemm.hpp | 12 -- src/database/kernels/xgemv.hpp | 200 +++++++++++++------------- src/database/kernels/xgemv_fast.hpp | 247 ++++++++++++++++++++++++++++++++ src/database/kernels/xgemv_fast_rot.hpp | 245 +++++++++++++++++++++++++++++++ src/routines/level2/xgemv.cpp | 2 +- src/tuning/kernels/xgemv.cpp | 2 +- 9 files changed, 604 insertions(+), 117 deletions(-) create mode 100644 src/database/kernels/xgemv_fast.hpp create mode 100644 src/database/kernels/xgemv_fast_rot.hpp diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py index 9c9f7eb4..46b711cc 100644 --- a/scripts/database/database/clblast.py +++ b/scripts/database/database/clblast.py @@ -64,8 +64,9 @@ def get_cpp_footer(): def get_cpp_precision(family, precision): """Retrieves the C++ code for the start of a new precision""" precision_string = precision_to_string(precision) + camelcase_name = family.title().replace("_", "") return("\n\nconst Database::DatabaseEntry Database::%s%s = {\n \"%s\", Precision::k%s, {\n" - % (family.title(), precision_string, family.title(), precision_string)) + % (camelcase_name, precision_string, camelcase_name, precision_string)) def get_cpp_device_vendor(vendor, device_type): diff --git a/src/database/database.cpp b/src/database/database.cpp index 47f1da16..28124455 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -17,6 +17,8 @@ #include "database/kernels/xaxpy.hpp" #include "database/kernels/xdot.hpp" #include "database/kernels/xgemv.hpp" +#include "database/kernels/xgemv_fast.hpp" +#include "database/kernels/xgemv_fast_rot.hpp" #include "database/kernels/xger.hpp" #include "database/kernels/xgemm.hpp" #include "database/kernels/copy.hpp" @@ -32,8 +34,10 @@ const std::vector Database::database = { XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble, XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, + XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble, + XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, - XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + /* XgemmHalf, */ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, diff --git a/src/database/database.hpp b/src/database/database.hpp index e84357dc..2fd96411 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -71,8 +71,10 @@ class Database { static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble; static const DatabaseEntry 
XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; + static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble; + static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; - static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; + static const DatabaseEntry /* XgemmHalf, */ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 736f2695..61b7ff05 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -14,18 +14,6 @@ namespace clblast { // ================================================================================================= -const Database::DatabaseEntry Database::XgemmHalf = { - "Xgemm", Precision::kHalf, { - { // Default - kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, - } - }, - } -}; - -// ================================================================================================= - const Database::DatabaseEntry Database::XgemmSingle = { "Xgemm", Precision::kSingle, { { // AMD GPUs diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 3aa1863f..6d680b06 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -18,13 +18,13 @@ const Database::DatabaseEntry Database::XgemvHalf = { "Xgemv", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, } @@ -36,57 +36,57 @@ const Database::DatabaseEntry Database::XgemvSingle = { "Xgemv", Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, - { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Oland", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tahiti", { 
{"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WPT1",1} } }, + { "Hawaii", { {"WGS1",128}, {"WPT1",1} } }, + { "Oland", { {"WGS1",128}, {"WPT1",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, + { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Iris", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } }, - { "Iris Pro", { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, + { "Iris", { {"WGS1",64}, {"WPT1",2} } }, + { "Iris Pro", { {"WGS1",256}, {"WPT1",2} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, 
{"WPT3",2} } }, - { "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, - { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Tesla K20m", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GRID K520", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 1070", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 680", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 750", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 980", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } }, + { "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } }, + { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } @@ -98,53 +98,53 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { "Xgemv", Precision::kComplexSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Oland", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Pitcairn", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "Tahiti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Hawaii", { {"WGS1",64}, {"WPT1",1} } }, + { "Oland", { {"WGS1",64}, {"WPT1",1} } }, + { "Pitcairn", { {"WGS1",64}, {"WPT1",1} } }, + { "Tahiti", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, 
{"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WPT1",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, - { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Iris", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Iris Pro", { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, + { "Iris", { {"WGS1",256}, {"WPT1",1} } }, + { "Iris Pro", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GRID K520", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 670", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 680", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 750", { {"WGS1",128}, {"WPT1",1} } }, - { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } @@ -156,47 +156,47 @@ const Database::DatabaseEntry 
Database::XgemvDouble = { "Xgemv", Precision::kDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Oland", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Hawaii", { {"WGS1",128}, {"WPT1",1} } }, + { "Oland", { {"WGS1",256}, {"WPT1",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, + { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",2} } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, - { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, 
{"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Tesla K20m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tesla K40m", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GRID K520", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 1070", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 480", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 680", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 750", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 750 Ti", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 980", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX TITAN", { {"WGS1",256}, {"WPT1",1} } }, + { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } }, + { "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } }, + { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } @@ -208,38 +208,38 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { "Xgemv", Precision::kComplexDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Oland", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Pitcairn", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tahiti", { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WPT1",1} } }, + { "Hawaii", { {"WGS1",64}, {"WPT1",1} } }, + { "Oland", { {"WGS1",256}, {"WPT1",1} } }, + { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, + { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Intel accelerators kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, 
{"WPT3",1} } }, + { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GRID K520", { {"WGS1",128}, {"WPT1",1} } }, + { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, + { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp new file mode 100644 index 00000000..65b15030 --- /dev/null +++ b/src/database/kernels/xgemv_fast.hpp @@ -0,0 +1,247 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xgemv_Fast' kernels. 
+// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastHalf = { + "XgemvFast", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "default", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastSingle = { + "XgemvFast", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, + { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 480", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 670", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 750", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 980", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastComplexSingle = { + "XgemvFast", Precision::kComplexSingle, { + { // AMD GPUs + kDeviceTypeGPU, 
"AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tahiti", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 680", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastDouble = { + "XgemvFast", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Oland", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 1070", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 670", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 680", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 750", 
{ {"VW2",2}, {"WGS2",256}, {"WPT2",2} } }, + { "GeForce GTX 750 Ti", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX 980", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX TITAN", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastComplexDouble = { + "XgemvFast", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Hawaii", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Oland", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Pitcairn", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "Tahiti", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "GeForce GTX 480", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "GeForce GTX 670", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp new file mode 100644 index 00000000..ee866e26 --- /dev/null +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -0,0 +1,245 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Database generator +// +// This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot' kernels. 
+// +// ================================================================================================= + +namespace clblast { +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotHalf = { + "XgemvFastRot", Precision::kHalf, { + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotSingle = { + "XgemvFastRot", Precision::kSingle, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Hawaii", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Oland", { {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, + { "Pitcairn", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Tahiti", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW3",2}, {"WGS3",64}, {"WPT3",4} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "Iris", { {"VW3",4}, {"WGS3",64}, {"WPT3",8} } }, + { "Iris Pro", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 1070", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 480", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 670", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 680", { {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, + { "GeForce GTX 750", { {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, + { "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, + { "GeForce GTX 980", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "GeForce GTX TITAN X", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Tesla K20m", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "Tesla K40m", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { + "XgemvFastRot", Precision::kComplexSingle, { + { // AMD GPUs + 
kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Oland", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "Pitcairn", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "Tahiti", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel GPUs + kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 530", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, + { "Iris", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Iris Pro", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 480", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 670", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 680", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotDouble = { + "XgemvFastRot", Precision::kDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Oland", { {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, + { "Pitcairn", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "Tahiti", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW3",1}, {"WGS3",64}, {"WPT3",2} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 1070", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "GeForce GTX 480", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 670", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 680", { {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, + { "GeForce GTX 750", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, + { "GeForce GTX 750 Ti", { {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, + { 
"GeForce GTX 980", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "GeForce GTX TITAN X", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Tesla K20m", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Tesla K40m", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= + +const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = { + "XgemvFastRot", Precision::kComplexDouble, { + { // AMD GPUs + kDeviceTypeGPU, "AMD", { + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, + { "Hawaii", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Oland", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "Pitcairn", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Tahiti", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Intel CPUs + kDeviceTypeCPU, "Intel", { + { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, + { "default", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + } + }, + { // Intel accelerators + kDeviceTypeAccelerator, "Intel", { + { "Intel(R) Many Integrated Core Acceleration Card", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // NVIDIA GPUs + kDeviceTypeGPU, "NVIDIA", { + { "GRID K520", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, + { "GeForce GTX 480", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX 670", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + } + }, + } +}; + +// ================================================================================================= +} // namespace clblast diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index e4d407c8..4e32ba41 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -22,7 +22,7 @@ namespace clblast { // Constructor: forwards to base class constructor template Xgemv::Xgemv(Queue &queue, EventPointer event, const std::string &name): - Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue()) { + Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue()) { source_string_ = #include "../../kernels/level2/xgemv.opencl" #include "../../kernels/level2/xgemv_fast.opencl" diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp index 96d4a5f2..7229602d 100644 --- a/src/tuning/kernels/xgemv.cpp +++ b/src/tuning/kernels/xgemv.cpp @@ -29,7 +29,7 @@ class TuneXgemv { public: // The representative kernel and the source code - static std::string KernelFamily() { return "xgemv_"+std::to_string(V); } + static std::string KernelFamily() { return (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); } static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? 
"XgemvFast" : "XgemvFastRot"); } static std::string GetSources() { return -- cgit v1.2.3 From de1afe168d8da92d49d0239d8b5ff4385ae37326 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 25 Jul 2016 22:57:23 +0200 Subject: Removed all old tuning results for the XgemvFastRot kernel; re-added for a couple of devices --- src/database/database.cpp | 2 +- src/database/database.hpp | 2 +- src/database/kernels/xgemv_fast_rot.hpp | 167 ++++++-------------------------- 3 files changed, 32 insertions(+), 139 deletions(-) diff --git a/src/database/database.cpp b/src/database/database.cpp index 28124455..38974b95 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -35,7 +35,7 @@ const std::vector Database::database = { XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble, - XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, + /* XgemvFastRotHalf, */ XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, /* XgemmHalf, */ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, diff --git a/src/database/database.hpp b/src/database/database.hpp index 2fd96411..8d6d3863 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -72,7 +72,7 @@ class Database { static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble; - static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; + static const DatabaseEntry /* XgemvFastRotHalf, */ XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; static const DatabaseEntry /* XgemmHalf, */ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index ee866e26..9822fb20 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -14,79 +14,36 @@ namespace clblast { // ================================================================================================= -const Database::DatabaseEntry Database::XgemvFastRotHalf = { - "XgemvFastRot", Precision::kHalf, { - { // Intel GPUs - kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Default - kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - } -}; - -// ================================================================================================= - const Database::DatabaseEntry Database::XgemvFastRotSingle = { "XgemvFastRot", 
Precision::kSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Hawaii", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Oland", { {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, - { "Pitcairn", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tahiti", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, + { "default", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW3",2}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics 530", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, - { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Iris", { {"VW3",4}, {"WGS3",64}, {"WPT3",8} } }, - { "Iris Pro", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, + { "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 1070", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 480", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 670", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 680", { {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750", { {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750 Ti", { {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, - { "GeForce GTX 980", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX TITAN X", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Tesla K20m", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Tesla K40m", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",8} } }, } }, } @@ -98,49 +55,26 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { "XgemvFastRot", Precision::kComplexSingle, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Oland", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Pitcairn", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "Tahiti", { {"VW3",1}, 
{"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { - { "Intel(R) HD Graphics 530", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",4} } }, - { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",64}, {"WPT3",4} } }, - { "Iris", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Iris Pro", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 670", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 680", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, + { "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, } }, } @@ -152,47 +86,25 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = { "XgemvFastRot", Precision::kDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Oland", { {"VW3",4}, {"WGS3",256}, {"WPT3",4} } }, - { "Pitcairn", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "Tahiti", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW3",1}, {"WGS3",64}, {"WPT3",2} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } }, } }, { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { 
"GeForce GTX 1070", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 670", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 680", { {"VW3",2}, {"WGS3",128}, {"WPT3",2} } }, - { "GeForce GTX 750", { {"VW3",2}, {"WGS3",64}, {"WPT3",2} } }, - { "GeForce GTX 750 Ti", { {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, - { "GeForce GTX 980", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX TITAN X", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Tesla K20m", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tesla K40m", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "GeForce GTX TITAN", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",8} } }, } }, } @@ -204,38 +116,19 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = { "XgemvFastRot", Precision::kComplexDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"VW3",1}, {"WGS3",128}, {"WPT3",1} } }, - { "Hawaii", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Oland", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Pitcairn", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "Tahiti", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { - { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW3",2}, {"WGS3",256}, {"WPT3",2} } }, - { "default", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - } - }, - { // Intel accelerators - kDeviceTypeAccelerator, "Intel", { - { "Intel(R) Many Integrated Core Acceleration Card", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - } - }, - { // NVIDIA GPUs - kDeviceTypeGPU, "NVIDIA", { - { "GRID K520", { {"VW3",1}, {"WGS3",256}, {"WPT3",1} } }, - { "GeForce GTX 480", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "GeForce GTX 670", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",64}, {"WPT3",1} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, } -- cgit v1.2.3 From 570cbcffa71e5142194921ae42a1541dc7c6e969 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Sun, 24 Jul 2016 19:10:39 +0300 Subject: CMakeLists.txt: provide a find_package() config for dependent projects --- CMakeLists.txt | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 95d1d500..42a36732 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,11 +127,6 @@ endif() # ================================================================================================== -# Includes directories: CLBlast and OpenCL 
-include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS}) - -# ================================================================================================== - # Sets the supported routines and the used kernels. New routines and kernels should be added here. set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger xgemm xgemv) set(SAMPLE_PROGRAMS_CPP sgemm) @@ -173,17 +168,27 @@ endforeach() add_library(clblast SHARED ${SOURCES}) target_link_libraries(clblast ${OPENCL_LIBRARIES}) +# Includes directories: CLBlast and OpenCL +target_include_directories(clblast PUBLIC + $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/include> + $<BUILD_INTERFACE:${clblast_SOURCE_DIR}/src> + $<INSTALL_INTERFACE:include> + ${OPENCL_INCLUDE_DIRS}) + # Sets the proper __declspec(dllexport) keyword for Visual Studio when the library is built if(MSVC) target_compile_definitions(clblast PRIVATE COMPILING_DLL=1) # requires at least CMake 2.8.11 endif() # Installs the library -install(TARGETS clblast DESTINATION lib) +install(TARGETS clblast EXPORT CLBlast DESTINATION lib) install(FILES include/clblast.h DESTINATION include) install(FILES include/clblast_c.h DESTINATION include) install(FILES include/clblast_half.h DESTINATION include) +# Installs the config for find_package in dependent projects +install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake) + # ================================================================================================== # Sets a default platform ($DEVICEPLATFORM) and device ($DEFAULT_DEVICE) to run tuners and tests on @@ -197,6 +202,11 @@ endif() # ================================================================================================== +# Includes directories: CLBlast and OpenCL +include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS}) + +# ================================================================================================== + # This section contains all the code related to the examples if(SAMPLES) -- cgit v1.2.3 From b5d7b583937cecb2c8c58b676db8617ea93fb474 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Thu, 28 Jul 2016 17:53:54 +0300 Subject: CMakeLists.txt: use target_include_directories() --- CMakeLists.txt | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 42a36732..231b8e95 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,11 +202,6 @@ endif() # ================================================================================================== -# Includes directories: CLBlast and OpenCL -include_directories(${clblast_SOURCE_DIR}/include ${clblast_SOURCE_DIR}/src ${OPENCL_INCLUDE_DIRS}) - -# ================================================================================================== - # This section contains all the code related to the examples if(SAMPLES) @@ -236,7 +231,6 @@ endif() if(TUNERS) # Includes CLTune - include_directories(${CLTUNE_INCLUDE_DIRS}) # Visual Studio requires the sources of non-exported objects/libraries set(TUNERS_COMMON ) @@ -248,6 +242,7 @@ if(TUNERS) foreach(KERNEL ${KERNELS}) add_executable(clblast_tuner_${KERNEL} ${TUNERS_COMMON} src/tuning/kernels/${KERNEL}.cpp) target_link_libraries(clblast_tuner_${KERNEL} clblast ${CLTUNE_LIBRARIES} ${OPENCL_LIBRARIES}) + target_include_directories(clblast_tuner_${KERNEL} PUBLIC ${CLTUNE_INCLUDE_DIRS}) install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin) endforeach() @@ -291,9 +286,6 @@ if(CLIENTS OR TESTS) endif() endif() - # Sets the include directories -
include_directories(${clblast_SOURCE_DIR} ${REF_INCLUDES}) - endif() # ================================================================================================== @@ -309,6 +301,10 @@ if(CLIENTS) else() # Creates the common performance-tests objects (requires CMake 2.8.8) add_library(test_performance_common OBJECT test/performance/client.cpp) + # Adds clblast's interface include pathes because we can't link to clblast here + target_include_directories(test_performance_common PRIVATE + $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> + ${clblast_SOURCE_DIR}) set(CLIENTS_COMMON ${CLIENTS_COMMON} $<TARGET_OBJECTS:test_performance_common>) endif() @@ -331,6 +327,7 @@ if(CLIENTS) endforeach() foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) + target_include_directories(clblast_client_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) install(TARGETS clblast_client_${ROUTINE} DESTINATION bin) endforeach() @@ -352,6 +349,9 @@ if(TESTS) # Creates the common correctness-tests objects (requires CMake 2.8.8) add_library(test_correctness_common OBJECT test/correctness/tester.cpp test/correctness/testblas.cpp) + target_include_directories(test_correctness_common PUBLIC + $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> + ${clblast_SOURCE_DIR}) set(TESTS_COMMON ${TESTS_COMMON} $<TARGET_OBJECTS:test_correctness_common>) endif() @@ -375,6 +375,7 @@ if(TESTS) foreach(ROUTINE ${ROUTINES}) target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES}) install(TARGETS clblast_test_${ROUTINE} DESTINATION bin) + target_include_directories(clblast_test_${ROUTINE} PUBLIC ${clblast_SOURCE_DIR} ${REF_INCLUDES}) add_test(clblast_test_${ROUTINE} clblast_test_${ROUTINE} ${DEVICEPLATFORM}) endforeach() -- cgit v1.2.3 From 6c11fdc12ceaebbe79fec0d7a954d28884d41965 Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Thu, 28 Jul 2016 18:03:35 +0300 Subject: .travis.yml: use OpenCL ICD Loader and headers shipped by distro Using our own headers causes problems with CMake which does not like having OpenCL header path inside of the source tree. While at it, use distro's universal OpenCL loader as well.
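A note on the two CMakeLists.txt patches above: together they let an installed CLBlast be consumed through find_package() by a dependent project. A minimal sketch of such a consumer is shown below; it assumes CLBlast was installed to a prefix visible to CMake, and the project name clblast_consumer, the target example_sgemm and its source main.cpp are placeholders rather than anything defined by these patches.

    cmake_minimum_required(VERSION 2.8.11)
    project(clblast_consumer CXX)

    # Loads the installed CLBlastConfig.cmake; if it is not found automatically,
    # point CMake at it explicitly, e.g. -DCLBlast_DIR=<prefix>/lib/cmake/CLBLast
    find_package(CLBlast CONFIG REQUIRED)

    add_executable(example_sgemm main.cpp)

    # The exported 'clblast' target carries its interface include directories,
    # so linking against it should be all that is needed
    target_link_libraries(example_sgemm clblast)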
--- .travis.yml | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8e1a80db..0465afa4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,49 +17,21 @@ addons: - kubuntu-backports packages: - cmake + - ocl-icd-opencl-dev env: global: - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast - - OPENCL_REGISTRY=https://www.khronos.org/registry/cl - - OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl before_install: - cmake --version; - ${CC} --version; - ${CXX} --version; -install: - # The following linux logic is necessary because of Travis's move to the GCE platform, which does not - # currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221 - # We build our own linkable .so file - - if [ ${TRAVIS_OS_NAME} == "linux" ]; then - mkdir -p ${OPENCL_ROOT}; - pushd ${OPENCL_ROOT}; - travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; - mv ./OpenCL-ICD-Loader/* .; - travis_retry git clone --depth 1 https://github.com/KhronosGroup/OpenCL-Headers.git inc/CL; - pushd inc/CL; - travis_retry wget -w 1 -np -nd -nv -A h,hpp ${OPENCL_REGISTRY}/api/2.1/cl.hpp; - popd; - mkdir -p lib; - pushd lib; - cmake -G "Unix Makefiles" ..; - make; - cp ./bin/libOpenCL.so .; - popd; - pushd inc/CL; - travis_retry git fetch origin opencl12:opencl12; - git checkout opencl12; - popd; - mv inc/ include/; - popd; - fi - before_script: - mkdir -p ${CLBLAST_ROOT} - pushd ${CLBLAST_ROOT} - - cmake -DOPENCL_ROOT=${OPENCL_ROOT} -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR} + - cmake -DTESTS=ON -DCLIENTS=ON ${TRAVIS_BUILD_DIR} script: - make -- cgit v1.2.3 From 227374deba218fb03a802d81febf04bbd207167a Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Thu, 28 Jul 2016 18:15:56 +0300 Subject: .appveyor.yml: move {OPENCL,CLBLAST}_ROOT out of source tree Reasoning is the same as in previous commit: CMake does not like having OpenCL header path inside of the source tree. CLBLAST_ROOT is moved for uniformity. 
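For context on OPENCL_ROOT being moved out of the source tree here: it is only a search hint passed to CMake (see the removed cmake -DOPENCL_ROOT=... invocation in the .travis.yml patch above), so it can point to any location outside the checkout. A rough sketch of how a FindOpenCL-style module typically consumes such a hint is given below; this is illustrative only and not necessarily the module shipped with the project, but the variable names mirror OPENCL_ROOT, OPENCL_INCLUDE_DIRS and OPENCL_LIBRARIES as used elsewhere in this build.

    # Locates the OpenCL headers and the ICD loader, preferring an explicit
    # OPENCL_ROOT hint (CMake or environment variable) over system locations
    find_path(OPENCL_INCLUDE_DIRS
              NAMES CL/cl.h OpenCL/cl.h
              HINTS ${OPENCL_ROOT}/include $ENV{OPENCL_ROOT}/include)
    find_library(OPENCL_LIBRARIES
                 NAMES OpenCL
                 HINTS ${OPENCL_ROOT}/lib ${OPENCL_ROOT}/lib64 $ENV{OPENCL_ROOT}/lib)
    mark_as_advanced(OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES)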
--- .appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 8597e43e..eb7f1c97 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,8 +1,8 @@ environment: global: - CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\clblast" + CLBLAST_ROOT: "%APPVEYOR_BUILD_FOLDER%\\..\\bin\\clblast" OPENCL_REGISTRY: "https://www.khronos.org/registry/cl" - OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl" + OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\..\\bin\\opencl" platform: - x64 -- cgit v1.2.3 From 35623cd98d798173661c8d86c84fb48261a2493e Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 28 Jul 2016 20:45:09 +0200 Subject: Minor update regarding the previous CMake export/install target changes --- CHANGELOG | 1 + CMakeLists.txt | 5 ++--- README.md | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index d018e211..8fce1969 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,7 @@ Development version (next release) - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels - Added an option (-warm_up) to do a warm-up run before timing in the performance clients - Improved performance significantly of rotated GEMV computations +- Various minor fixes and enhancements - Added tuned parameters for various devices (see README) Version 0.8.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 231b8e95..7393c6e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -230,8 +230,6 @@ endif() # the CLTune library (not included as part of the source). if(TUNERS) - # Includes CLTune - # Visual Studio requires the sources of non-exported objects/libraries set(TUNERS_COMMON ) if(MSVC) @@ -301,7 +299,8 @@ if(CLIENTS) else() # Creates the common performance-tests objects (requires CMake 2.8.8) add_library(test_performance_common OBJECT test/performance/client.cpp) - # Adds clblast's interface include pathes because we can't link to clblast here + + # Adds CLBlast's interface include paths because we can't link to CLBlast here target_include_directories(test_performance_common PRIVATE $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${clblast_SOURCE_DIR}) diff --git a/README.md b/README.md index 1b3adcb9..d9d30cb1 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,7 @@ The contributing authors (code, pull requests, testing) so far are: * [Marco Hutter](https://github.com/gpus) * [Hugh Perkins](https://github.com/hughperkins) * [Gian-Carlo Pascutto](https://github.com/gcp) +* [Ivan Shapovalov](https://github.com/intelfx) Tuning and testing on a variety of OpenCL devices was made possible by: -- cgit v1.2.3 From 3f5401d4c8947945c4770fb1dfd354892702195f Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 7 Aug 2016 16:25:38 +0200 Subject: Added a first version of the database's common-best default calculation --- scripts/database/database.py | 3 +- scripts/database/database/defaults.py | 70 +++++++++++++++++++++++++++++++---- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/scripts/database/database.py b/scripts/database/database.py index e115d68c..0ce89d03 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -98,7 +98,8 @@ def main(argv): database_best_results = bests.get_best_results(database) # Determines the defaults for other vendors and per vendor - database_defaults = defaults.calculate_defaults(database_best_results) + print("[database] Calculating the default values...") + database_defaults = defaults.calculate_defaults(database) database_best_results =
db.concatenate_database(database_best_results, database_defaults) # Outputs the database as a C++ database diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 357c3a3a..fca793ea 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -6,7 +6,9 @@ # Cedric Nugteren import pandas as pd + import clblast +import bests def set_default_device(database_entry): @@ -23,16 +25,18 @@ def set_default_time(database_entry): return database_entry -def calculate_defaults(df): - """# Sets defaults for devices of the same type/vendor based on the smallest values of all known entries. The average - might be better for performance but some parameters might not be supported on other devices.""" +def calculate_defaults(database, calculate_common_best=True): + """Sets defaults for devices of the same type/vendor. An option determines how to compute the defaults.""" database_defaults = pd.DataFrame() # Defaults per combination of device vendors and device types (e.g. AMD GPU) - database_type_vendor = df.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + - clblast.ARGUMENT_ATTRIBUTES) + database_type_vendor = database.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + + clblast.ARGUMENT_ATTRIBUTES) for group_name, database_group in database_type_vendor: - default_values = database_group.min(axis=0) + if calculate_common_best: + default_values = get_common_best(database_group, group_name) + else: + default_values = get_smallest_best(database_group) default_values = set_default_device(default_values) default_values = set_default_time(default_values) database_defaults = database_defaults.append(default_values, ignore_index=True) @@ -45,9 +49,9 @@ def calculate_defaults(df): print("[WARNING] Entries for a single kernel with multiple argument values: " + description) # Defaults over all device types and vendors - groups = df.groupby(clblast.KERNEL_ATTRIBUTES + ["kernel"] + clblast.ARGUMENT_ATTRIBUTES) + groups = database.groupby(clblast.KERNEL_ATTRIBUTES + ["kernel"] + clblast.ARGUMENT_ATTRIBUTES) for group_name, database_group in groups: - default_values = database_group.min(axis=0) + default_values = get_smallest_best(database_group) default_values["device_vendor"] = clblast.VENDOR_DEFAULT default_values["device_type"] = clblast.DEVICE_TYPE_DEFAULT default_values = set_default_device(default_values) @@ -56,3 +60,53 @@ def calculate_defaults(df): # Database with both types of defaults only return database_defaults + + +def get_smallest_best(database): + """Sets defaults based on the smallest values of all known entries. The average might be better for performance but + some parameters might not be supported on other devices.""" + database_best_results = bests.get_best_results(database) + return database_best_results.min(axis=0) + + +def get_common_best(database, group_name): + """Sets defaults based on the best values of entries supported by all devices. 
This might cause a problem in case + not every device was tuned with the same parameters.""" + # TODO: Quite a bit slower than the above `get_smallest_best` method + + # Counts the number of devices in this group + num_devices = len(database.groupby(clblast.DEVICE_ATTRIBUTES)) + + # Removes columns without any values + database = database.dropna(axis=1, how='all') + + # Retrieves the parameter names for this kernel + all_column_names = list(database.columns.values) + parameter_column_names = [c for c in all_column_names if "parameters." in c] + + # Removes entries which are not available for all devices + database_common = pd.DataFrame() + database_by_parameters = database.groupby(parameter_column_names) + for parameter_values, database_parameters in database_by_parameters: + num_entries = database_parameters.shape[0] + if num_entries == num_devices: + database_common = database_common.append(database_parameters) + + # Fall back to another method in case there are no shared entries at all across devices + if database_common.shape[0] == 0: + # print("Skipping: " + str(group_name) + " with devices: " + str(num_devices) + " " + str(database.shape[0])) + return get_smallest_best(database) + + # Computes the sum of the execution times over the different devices + database_common['time'] = database_common.groupby(parameter_column_names)['time'].transform(sum) + + # Retrieves the entries with the best execution time + best_time = database_common["time"].min() + database_bests = database_common[database_common["time"] == best_time] + + # Retrieves one example only (the parameters are the same anyway) + database_bests = database_bests.drop_duplicates(["time"]) + # print(str(group_name) + " with num devices: " + str(num_devices) + " " + str(database_bests.shape)) + assert database_bests.shape[0] == 1 + + return database_bests -- cgit v1.2.3 From 7da6492b36cae7ba8859cd6d6ab3250e11f9a2b8 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 9 Aug 2016 21:06:04 +0200 Subject: Improved the speed of the new common-best defaults method for the database generation --- scripts/database/database/defaults.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index fca793ea..48693247 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -85,28 +85,26 @@ def get_common_best(database, group_name): parameter_column_names = [c for c in all_column_names if "parameters." 
in c] # Removes entries which are not available for all devices - database_common = pd.DataFrame() database_by_parameters = database.groupby(parameter_column_names) - for parameter_values, database_parameters in database_by_parameters: - num_entries = database_parameters.shape[0] - if num_entries == num_devices: - database_common = database_common.append(database_parameters) + database_common = database_by_parameters.filter(lambda x: len(x) == num_devices) # Fall back to another method in case there are no shared entries at all across devices - if database_common.shape[0] == 0: - # print("Skipping: " + str(group_name) + " with devices: " + str(num_devices) + " " + str(database.shape[0])) + if len(database_common) == 0: + # print("[database] Skipping: " + str(group_name) + " with devices: %d %d " % (num_devices, len(database))) return get_smallest_best(database) # Computes the sum of the execution times over the different devices - database_common['time'] = database_common.groupby(parameter_column_names)['time'].transform(sum) + database_common_by_parameters = database_common.groupby(parameter_column_names) + group_times = database_common_by_parameters['time'].transform(sum) + database_common.loc[:, 'group_time'] = group_times # Retrieves the entries with the best execution time - best_time = database_common["time"].min() - database_bests = database_common[database_common["time"] == best_time] + best_time = database_common["group_time"].min() + database_bests = database_common[database_common["group_time"] == best_time] # Retrieves one example only (the parameters are the same anyway) - database_bests = database_bests.drop_duplicates(["time"]) - # print(str(group_name) + " with num devices: " + str(num_devices) + " " + str(database_bests.shape)) - assert database_bests.shape[0] == 1 + database_bests = database_bests.drop_duplicates(["group_time"]) + # print("[database] " + str(group_name) + " with devices: " + str(num_devices) + " " + str(database_bests.shape)) + assert len(database_bests) == 1 return database_bests -- cgit v1.2.3 From 7d5631b7e4bb011725b512b55fefa1fa8165a8dd Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 15 Aug 2016 21:01:07 +0200 Subject: Updated the database script to calculate the relative best performance of tuning results common for a device/vendor type --- scripts/database/database.py | 2 +- scripts/database/database/defaults.py | 39 +++++++++++++++++++++------------ src/database/kernels/copy.hpp | 22 +++++++++---------- src/database/kernels/pad.hpp | 28 +++++++++++------------ src/database/kernels/padtranspose.hpp | 22 +++++++++---------- src/database/kernels/transpose.hpp | 26 +++++++++++----------- src/database/kernels/xaxpy.hpp | 22 +++++++++---------- src/database/kernels/xdot.hpp | 18 +++++++-------- src/database/kernels/xgemv.hpp | 14 ++++++------ src/database/kernels/xgemv_fast.hpp | 10 ++++----- src/database/kernels/xgemv_fast_rot.hpp | 4 ++-- src/database/kernels/xger.hpp | 28 +++++++++++------------ 12 files changed, 123 insertions(+), 112 deletions(-) diff --git a/scripts/database/database.py b/scripts/database/database.py index 0ce89d03..6d370d99 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -99,7 +99,7 @@ def main(argv): # Determines the defaults for other vendors and per vendor print("[database] Calculating the default values...") - database_defaults = defaults.calculate_defaults(database) + database_defaults = defaults.calculate_defaults(database, cl_args.verbose) database_best_results = 
db.concatenate_database(database_best_results, database_defaults) # Outputs the database as a C++ database diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 48693247..985f24bd 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -25,7 +25,7 @@ def set_default_time(database_entry): return database_entry -def calculate_defaults(database, calculate_common_best=True): +def calculate_defaults(database, verbose, calculate_common_best=True): """Sets defaults for devices of the same type/vendor. An option determines how to compute the defaults.""" database_defaults = pd.DataFrame() @@ -34,7 +34,7 @@ def calculate_defaults(database, calculate_common_best=True): clblast.ARGUMENT_ATTRIBUTES) for group_name, database_group in database_type_vendor: if calculate_common_best: - default_values = get_common_best(database_group, group_name) + default_values = get_common_best(database_group, group_name, verbose) else: default_values = get_smallest_best(database_group) default_values = set_default_device(default_values) @@ -69,16 +69,23 @@ def get_smallest_best(database): return database_best_results.min(axis=0) -def get_common_best(database, group_name): +def get_common_best(database, group_name, verbose): """Sets defaults based on the best values of entries supported by all devices. This might cause a problem in case - not every device was tuned with the same parameters.""" - # TODO: Quite a bit slower than the above `get_smallest_best` method + not every device was tuned with the same parameters. In that case it falls back to the above method to retrieve + the smallest best execution time""" # Counts the number of devices in this group num_devices = len(database.groupby(clblast.DEVICE_ATTRIBUTES)) # Removes columns without any values database = database.dropna(axis=1, how='all') + database = database.reset_index() + + # Inserts the relative execution times into the database + def relative_performance(x): + x["relative_performance"] = x["time"].min() / x["time"] + return x + database = database.groupby(clblast.ATTRIBUTES + ["kernel"]).apply(relative_performance) # Retrieves the parameter names for this kernel all_column_names = list(database.columns.values) @@ -94,17 +101,21 @@ def get_common_best(database, group_name): return get_smallest_best(database) # Computes the sum of the execution times over the different devices - database_common_by_parameters = database_common.groupby(parameter_column_names) - group_times = database_common_by_parameters['time'].transform(sum) - database_common.loc[:, 'group_time'] = group_times + def sum_performance(x): + x["group_performance"] = x["relative_performance"].sum() + return x + database_common = database_common.groupby(parameter_column_names).apply(sum_performance) - # Retrieves the entries with the best execution time - best_time = database_common["group_time"].min() - database_bests = database_common[database_common["group_time"] == best_time] + # Retrieves the entries with the highest performance + best_performance = database_common["group_performance"].max() + database_bests = database_common[database_common["group_performance"] == best_performance] # Retrieves one example only (the parameters are the same anyway) - database_bests = database_bests.drop_duplicates(["group_time"]) - # print("[database] " + str(group_name) + " with devices: " + str(num_devices) + " " + str(database_bests.shape)) - assert len(database_bests) == 1 + database_bests = 
database_bests.drop_duplicates(["group_performance"]) + # Completed, report and return the results + if verbose: + print("[database] " + str(group_name) + " with performance " + str(best_performance) + " with devices: " + + str(num_devices) + " " + str(database_bests.shape)) + assert len(database_bests) == 1 return database_bests diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index d592f110..e9902293 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -41,7 +41,7 @@ const Database::DatabaseEntry Database::CopySingle = { { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, } }, { // ARM GPUs @@ -65,7 +65,7 @@ const Database::DatabaseEntry Database::CopySingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::CopySingle = { { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } }, { // Default @@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Intel CPUs @@ -118,7 +118,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel GPUs @@ -149,7 +149,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { "GeForce GTX TITAN X", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } }, { "Tesla 
K40m", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // Default @@ -171,7 +171,7 @@ const Database::DatabaseEntry Database::CopyDouble = { { "Oland", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } }, { "Pitcairn", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, } }, { // ARM GPUs @@ -185,7 +185,7 @@ const Database::DatabaseEntry Database::CopyDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators @@ -208,7 +208,7 @@ const Database::DatabaseEntry Database::CopyDouble = { { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Default @@ -230,7 +230,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { "Oland", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Pitcairn", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Tahiti", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, { // ARM GPUs @@ -244,7 +244,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } }, } }, { // Intel accelerators diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index cd034f15..a242a827 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -41,7 +41,7 @@ const Database::DatabaseEntry Database::PadSingle = { { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Pitcairn", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { 
{"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -55,7 +55,7 @@ const Database::DatabaseEntry Database::PadSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } }, } }, { // Intel GPUs @@ -65,7 +65,7 @@ const Database::DatabaseEntry Database::PadSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, } }, { // Intel accelerators @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::PadSingle = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Default @@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -124,7 +124,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel GPUs @@ -134,7 +134,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris Pro", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, 
+ { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } }, } }, { // Intel accelerators @@ -157,7 +157,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, { // Default @@ -179,7 +179,7 @@ const Database::DatabaseEntry Database::PadDouble = { { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tahiti", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, { // ARM GPUs @@ -193,7 +193,7 @@ const Database::DatabaseEntry Database::PadDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -216,7 +216,7 @@ const Database::DatabaseEntry Database::PadDouble = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K40m", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default @@ -238,7 +238,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { "Oland", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Pitcairn", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tahiti", { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // ARM GPUs @@ -252,7 +252,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -275,7 +275,7 @@ const Database::DatabaseEntry 
Database::PadComplexDouble = { { "GeForce GTX TITAN X", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Tesla K20m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Tesla K40m", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, { // Default diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index c2034c3e..0f63eafa 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -55,7 +55,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, } }, { // Intel GPUs @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } }, } }, { // Default @@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -134,7 +134,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris Pro", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, { // Intel accelerators @@ -157,7 +157,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default @@ -179,7 +179,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",1}, 
{"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -193,7 +193,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // Intel accelerators @@ -216,7 +216,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default @@ -238,7 +238,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { "Oland", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Pitcairn", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Tahiti", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // ARM GPUs @@ -252,7 +252,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, - { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } }, } }, { // Intel accelerators @@ -275,7 +275,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { { "GeForce GTX TITAN X", { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } }, { "Tesla K20m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, { "Tesla K40m", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } }, } }, { // Default diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index 8e852c4b..d12d28f0 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -41,7 +41,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Pitcairn", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, 
{"TRA_WPT",4} } }, } }, { // ARM GPUs @@ -65,7 +65,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // Intel accelerators @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { { "GeForce GTX TITAN X", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K20m", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Tesla K40m", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // Default @@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "Oland", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Pitcairn", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // ARM GPUs @@ -124,7 +124,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // Intel GPUs @@ -134,7 +134,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, } }, { // NVIDIA GPUs @@ -151,7 +151,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, { // Default @@ -173,7 +173,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { { "Oland", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { 
"Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Tahiti", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, { // ARM GPUs @@ -187,7 +187,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } }, } }, { // Intel accelerators @@ -210,7 +210,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, { // Default @@ -232,7 +232,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { "Oland", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Pitcairn", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, { "Tahiti", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, { // ARM GPUs @@ -246,7 +246,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, + { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // NVIDIA GPUs @@ -263,7 +263,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { { "GeForce GTX TITAN X", { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K20m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, { "Tesla K40m", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, { // Default diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 905ee084..2f1dd638 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -41,7 +41,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",2}, 
{"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, { // ARM GPUs @@ -55,7 +55,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",256}, {"WPT",1} } }, } }, { // Intel GPUs @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tesla K20m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",4}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, } }, { // Default @@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "Oland", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // ARM GPUs @@ -124,7 +124,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",4}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",1}, {"WGS",1024}, {"WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",2}, {"WGS",1024}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",1024}, {"WPT",1} } }, } }, { // Intel GPUs @@ -134,7 +134,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",8} } }, } }, { // Intel accelerators @@ -157,7 +157,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, } }, { // Default @@ -193,7 +193,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",8}, {"WGS",2048}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, } }, { // Intel accelerators @@ -216,7 +216,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { { "GeForce GTX TITAN X", { {"VW",1}, {"WGS",512}, {"WPT",1} } }, { "Tesla K20m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Tesla K40m", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // Default @@ -238,7 +238,7 @@ 
const Database::DatabaseEntry Database::XaxpyComplexDouble = { { "Oland", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Pitcairn", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "Tahiti", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, { // ARM GPUs @@ -252,7 +252,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW",8}, {"WGS",128}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW",8}, {"WGS",512}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",1024}, {"WPT",1} } }, } }, { // Intel accelerators diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index e36dd8ca..394c25de 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -41,7 +41,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -56,7 +56,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",64}, {"WGS2",32} } }, } }, { // NVIDIA GPUs @@ -70,7 +70,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",1024}, {"WGS2",32} } }, - { "default", { {"WGS1",128}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",256} } }, } }, { // Default @@ -92,7 +92,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { "Oland", { {"WGS1",128}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -121,7 +121,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { "GeForce GTX 980", { {"WGS1",256}, {"WGS2",64} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",512}, {"WGS2",64} } }, } }, { // Default @@ -143,7 +143,7 @@ const Database::DatabaseEntry Database::XdotDouble = { { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -163,7 +163,7 @@ const Database::DatabaseEntry Database::XdotDouble = { { "GeForce GTX 980", { {"WGS1",128}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",256}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",512}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, { // Default @@ -185,7 +185,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { "Oland", { {"WGS1",256}, 
{"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -205,7 +205,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { "GeForce GTX 980", { {"WGS1",64}, {"WGS2",32} } }, { "GeForce GTX TITAN X", { {"WGS1",128}, {"WGS2",32} } }, { "Tesla K20m", { {"WGS1",128}, {"WGS2",32} } }, - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",64} } }, } }, { // Default diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 6d680b06..ba71893e 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -48,7 +48,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel GPUs @@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { { "GeForce GTX TITAN X", { {"WGS1",256}, {"WPT1",1} } }, { "Tesla K20m", { {"WGS1",128}, {"WPT1",1} } }, { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",256}, {"WPT1",1} } }, } }, { // Default @@ -161,14 +161,14 @@ const Database::DatabaseEntry Database::XgemvDouble = { { "Oland", { {"WGS1",256}, {"WPT1",1} } }, { "Pitcairn", { {"WGS1",256}, {"WPT1",1} } }, { "Tahiti", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",256}, {"WPT1",1} } }, } }, { // Intel CPUs kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",2} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, - { "default", { {"WGS1",64}, {"WPT1",2} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel accelerators @@ -191,7 +191,7 @@ const Database::DatabaseEntry Database::XgemvDouble = { { "GeForce GTX TITAN X", { {"WGS1",64}, {"WPT1",1} } }, { "Tesla K20m", { {"WGS1",256}, {"WPT1",1} } }, { "Tesla K40m", { {"WGS1",256}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Default @@ -220,7 +220,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"WGS1",64}, {"WPT1",4} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, { // Intel accelerators @@ -234,7 +234,7 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = { { "GRID K520", { {"WGS1",128}, {"WPT1",1} } }, { "GeForce GTX 480", { {"WGS1",64}, {"WPT1",1} } }, { "GeForce GTX 670", { {"WGS1",128}, {"WPT1",1} } }, - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, { // Default diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index 65b15030..7e948540 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -48,7 +48,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} 
} }, { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } }, + { "default", { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } }, } }, { // Intel GPUs @@ -58,7 +58,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, } }, { // Intel accelerators @@ -81,7 +81,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Tesla K20m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, } }, { // Default @@ -120,7 +120,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, } }, { // Intel accelerators @@ -189,7 +189,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = { { "GeForce GTX TITAN X", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tesla K20m", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, { "Tesla K40m", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, } }, { // Default diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index 9822fb20..a57dcc66 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -32,7 +32,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = { kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, - { "default", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, } }, { // NVIDIA GPUs @@ -69,7 +69,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, - { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } }, } }, { // Default diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 216925fc..06f65c46 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -41,7 +41,7 @@ const Database::DatabaseEntry Database::XgerSingle = { { "Oland", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, } }, { // ARM GPUs @@ -54,7 +54,7 @@ const Database::DatabaseEntry Database::XgerSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 
2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",512}, {"WGS2",1}, {"WPT",4} } }, } }, { // Intel GPUs @@ -63,7 +63,7 @@ const Database::DatabaseEntry Database::XgerSingle = { { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",4} } }, } }, { // NVIDIA GPUs @@ -75,7 +75,7 @@ const Database::DatabaseEntry Database::XgerSingle = { { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, { "GeForce GTX 750", { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, } }, { // Default @@ -97,7 +97,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { "Oland", { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",4}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } }, } }, { // ARM GPUs @@ -110,7 +110,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",512}, {"WGS2",1}, {"WPT",4} } }, } }, { // Intel GPUs @@ -119,7 +119,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } }, - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, } }, { // NVIDIA GPUs @@ -131,7 +131,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { "GeForce GTX 680", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 750", { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, { // Default @@ -153,7 +153,7 @@ const Database::DatabaseEntry Database::XgerDouble = { { "Oland", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } }, } }, { // ARM GPUs @@ -166,7 +166,7 @@ const Database::DatabaseEntry Database::XgerDouble = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } }, + { "default", { {"WGS1",512}, {"WGS2",1}, {"WPT",4} } 
}, } }, { // NVIDIA GPUs @@ -178,7 +178,7 @@ const Database::DatabaseEntry Database::XgerDouble = { { "GeForce GTX 680", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "GeForce GTX 750", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, { "GeForce GTX TITAN", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, } }, { // Default @@ -200,7 +200,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { "Oland", { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } }, { "Pitcairn", { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } }, { "Tahiti", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } }, } }, { // ARM GPUs @@ -213,7 +213,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, - { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",512}, {"WGS2",1}, {"WPT",4} } }, } }, { // NVIDIA GPUs @@ -225,7 +225,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { { "GeForce GTX 680", { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } }, { "GeForce GTX 750", { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } }, { "GeForce GTX TITAN", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } }, + { "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } }, } }, { // Default -- cgit v1.2.3 From 57f1aa76857cf0566e05b43b9b2a98a3a6139c8b Mon Sep 17 00:00:00 2001 From: "D. Van Assche" Date: Thu, 18 Aug 2016 17:33:13 +0200 Subject: Adapt opencl files for 1.1 OpenCL In OpenCL 1.1 __kernel has to be before __attribute__, at least with Vivante compiler. 
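The change below therefore only swaps the order of the two qualifiers on every kernel definition; kernel behaviour is unchanged. A minimal sketch of the two orderings (with a made-up kernel name, not one from the library):

  // Accepted by OpenCL 1.2 compilers, but rejected by at least the Vivante
  // OpenCL 1.1 compiler according to this patch:
  __attribute__((reqd_work_group_size(64, 1, 1)))
  __kernel void Example(__global float* xgm) { xgm[get_global_id(0)] *= 2.0f; }

  // Ordering applied by this patch; valid for OpenCL 1.1 and 1.2 compilers alike:
  __kernel __attribute__((reqd_work_group_size(64, 1, 1)))
  void Example(__global float* xgm) { xgm[get_global_id(0)] *= 2.0f; }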
--- src/kernels/level1/xamax.opencl | 8 ++++---- src/kernels/level1/xasum.opencl | 8 ++++---- src/kernels/level1/xaxpy.opencl | 8 ++++---- src/kernels/level1/xcopy.opencl | 8 ++++---- src/kernels/level1/xdot.opencl | 8 ++++---- src/kernels/level1/xnrm2.opencl | 8 ++++---- src/kernels/level1/xscal.opencl | 8 ++++---- src/kernels/level1/xswap.opencl | 8 ++++---- src/kernels/level2/xgemv.opencl | 4 ++-- src/kernels/level2/xgemv_fast.opencl | 8 ++++---- src/kernels/level2/xger.opencl | 4 ++-- src/kernels/level2/xher.opencl | 4 ++-- src/kernels/level2/xher2.opencl | 4 ++-- src/kernels/level3/convert_hermitian.opencl | 8 ++++---- src/kernels/level3/convert_symmetric.opencl | 8 ++++---- src/kernels/level3/convert_triangular.opencl | 8 ++++---- src/kernels/level3/copy_fast.opencl | 4 ++-- src/kernels/level3/copy_pad.opencl | 8 ++++---- src/kernels/level3/transpose_fast.opencl | 4 ++-- src/kernels/level3/transpose_pad.opencl | 8 ++++---- src/kernels/level3/xgemm_part2.opencl | 12 ++++++------ 21 files changed, 74 insertions(+), 74 deletions(-) diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl index 48d0eb5c..763c1c0e 100644 --- a/src/kernels/level1/xamax.opencl +++ b/src/kernels/level1/xamax.opencl @@ -30,8 +30,8 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xamax(const int n, +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xamax(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global singlereal* maxgm, __global unsigned int* imaxgm) { __local singlereal maxlm[WGS1]; @@ -95,8 +95,8 @@ __kernel void Xamax(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm, +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XamaxEpilogue(const __global singlereal* restrict maxgm, const __global unsigned int* restrict imaxgm, __global unsigned int* imax, const int imax_offset) { __local singlereal maxlm[WGS2]; diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl index 58d0f11b..1542a187 100644 --- a/src/kernels/level1/xasum.opencl +++ b/src/kernels/level1/xasum.opencl @@ -30,8 +30,8 @@ R"( // ================================================================================================= // The main reduction kernel, performing the loading and the majority of the operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xasum(const int n, +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xasum(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* output) { __local real lm[WGS1]; @@ -74,8 +74,8 @@ __kernel void Xasum(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. 
-__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XasumEpilogue(const __global real* restrict input, +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XasumEpilogue(const __global real* restrict input, __global real* asum, const int asum_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl index e0efadc1..73a4a535 100644 --- a/src/kernels/level1/xaxpy.opencl +++ b/src/kernels/level1/xaxpy.opencl @@ -22,8 +22,8 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xaxpy(const int n, const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { const real alpha = arg_alpha[0]; @@ -40,8 +40,8 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha, +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XaxpyFast(const int n, const __constant real* restrict arg_alpha, const __global realV* restrict xgm, __global realV* ygm) { const real alpha = arg_alpha[0]; diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl index 97c27ccf..c96c33ac 100644 --- a/src/kernels/level1/xcopy.opencl +++ b/src/kernels/level1/xcopy.opencl @@ -22,8 +22,8 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xcopy(const int n, +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xcopy(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* ygm, const int y_offset, const int y_inc) { @@ -38,8 +38,8 @@ __kernel void Xcopy(const int n, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. 
-__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XcopyFast(const int n, +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XcopyFast(const int n, const __global realV* restrict xgm, __global realV* ygm) { #pragma unroll diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl index e13eb3c1..210aee94 100644 --- a/src/kernels/level1/xdot.opencl +++ b/src/kernels/level1/xdot.opencl @@ -30,8 +30,8 @@ R"( // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the sum operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xdot(const int n, +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xdot(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, __global real* output, const int do_conjugate) { @@ -73,8 +73,8 @@ __kernel void Xdot(const int n, // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to // be launched with a single workgroup only. -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XdotEpilogue(const __global real* restrict input, +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XdotEpilogue(const __global real* restrict input, __global real* dot, const int dot_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl index 9803687a..3633efea 100644 --- a/src/kernels/level1/xnrm2.opencl +++ b/src/kernels/level1/xnrm2.opencl @@ -30,8 +30,8 @@ R"( // ================================================================================================= // The main reduction kernel, performing the multiplication and the majority of the operation -__attribute__((reqd_work_group_size(WGS1, 1, 1))) -__kernel void Xnrm2(const int n, +__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) +void Xnrm2(const int n, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* output) { __local real lm[WGS1]; @@ -72,8 +72,8 @@ __kernel void Xnrm2(const int n, // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to // be launched with a single workgroup only. 
-__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void Xnrm2Epilogue(const __global real* restrict input, +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void Xnrm2Epilogue(const __global real* restrict input, __global real* nrm2, const int nrm2_offset) { __local real lm[WGS2]; const int lid = get_local_id(0); diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl index 59936776..61ff5bb6 100644 --- a/src/kernels/level1/xscal.opencl +++ b/src/kernels/level1/xscal.opencl @@ -22,8 +22,8 @@ R"( // ================================================================================================= // Full version of the kernel with offsets and strided accesses -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void Xscal(const int n, const real alpha, +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void Xscal(const int n, const real alpha, __global real* xgm, const int x_offset, const int x_inc) { // Loops over the work that needs to be done (allows for an arbitrary number of threads) @@ -40,8 +40,8 @@ __kernel void Xscal(const int n, const real alpha, // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is // dividable by 'VW', 'WGS' and 'WPT'. -__attribute__((reqd_work_group_size(WGS, 1, 1))) -__kernel void XscalFast(const int n, const real alpha, +__kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) +void XscalFast(const int n, const real alpha, __global realV* xgm) { #pragma unroll for (int w=0; w 'a_ld' is a multiple of VW2 // --> 'a_rotated' is 0 // --> 'do_conjugate' is 0 -__attribute__((reqd_work_group_size(WGS2, 1, 1))) -__kernel void XgemvFast(const int m, const int n, +__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) +void XgemvFast(const int m, const int n, const __constant real* restrict arg_alpha, const __constant real* restrict arg_beta, const int a_rotated, @@ -196,8 +196,8 @@ __kernel void XgemvFast(const int m, const int n, // --> 'a_ld' is a multiple of VW3 // --> 'a_rotated' is 1 // --> 'do_conjugate' is 0 -__attribute__((reqd_work_group_size(WGS3, 1, 1))) -__kernel void XgemvFastRot(const int m, const int n, +__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1))) +void XgemvFastRot(const int m, const int n, const __constant real* restrict arg_alpha, const __constant real* restrict arg_beta, const int a_rotated, diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl index 63817afb..21744799 100644 --- a/src/kernels/level2/xger.opencl +++ b/src/kernels/level2/xger.opencl @@ -18,8 +18,8 @@ R"( // ================================================================================================= // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) -__attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xger(const int max1, const int max2, +__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +void Xger(const int max1, const int max2, const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* ygm, const int y_offset, const int y_inc, diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index fc635f2e..4b304a9f 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -18,8 +18,8 @@ R"( // ================================================================================================= // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) 
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xher(const int n, +__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +void Xher(const int n, const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* restrict agm, const int a_offset, const int a_ld, diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl index a66f255f..8d05f020 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -18,8 +18,8 @@ R"( // ================================================================================================= // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) -__attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xher2(const int n, +__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) +void Xher2(const int n, const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl index 53cc161a..272905eb 100644 --- a/src/kernels/level3/convert_hermitian.opencl +++ b/src/kernels/level3/convert_hermitian.opencl @@ -20,8 +20,8 @@ R"( // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void HermLowerToSquared(const int src_dim, +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void HermLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, @@ -59,8 +59,8 @@ __kernel void HermLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void HermUpperToSquared(const int src_dim, +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void HermUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl index c6ce93ca..ea6f7dbd 100644 --- a/src/kernels/level3/convert_symmetric.opencl +++ b/src/kernels/level3/convert_symmetric.opencl @@ -20,8 +20,8 @@ R"( // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. 
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmLowerToSquared(const int src_dim, +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void SymmLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, @@ -53,8 +53,8 @@ __kernel void SymmLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmUpperToSquared(const int src_dim, +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void SymmUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl index fdd2461a..858228bb 100644 --- a/src/kernels/level3/convert_triangular.opencl +++ b/src/kernels/level3/convert_triangular.opencl @@ -20,8 +20,8 @@ R"( // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void TriaLowerToSquared(const int src_dim, +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void TriaLowerToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, @@ -55,8 +55,8 @@ __kernel void TriaLowerToSquared(const int src_dim, } // Same as above, but now the matrix' data is stored in the upper-triangle -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void TriaUpperToSquared(const int src_dim, +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void TriaUpperToSquared(const int src_dim, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_dim, diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl index 09e54e6d..54f9d987 100644 --- a/src/kernels/level3/copy_fast.opencl +++ b/src/kernels/level3/copy_fast.opencl @@ -35,8 +35,8 @@ R"( // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of // COPY_VW. Also requires both matrices to be of the same dimensions and without offset. -__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) -__kernel void CopyMatrixFast(const int ld, +__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +void CopyMatrixFast(const int ld, __global const realC* restrict src, __global realC* dest, const __constant real* restrict arg_alpha) { diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl index d276cc60..92279ecf 100644 --- a/src/kernels/level3/copy_pad.opencl +++ b/src/kernels/level3/copy_pad.opencl @@ -24,8 +24,8 @@ R"( // Copies a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld // value and offset can be different. 
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void CopyPadMatrix(const int src_one, const int src_two, +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyPadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, @@ -65,8 +65,8 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two, // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but // writes only the actual data back to the destination matrix. Again, the ld value and offset can // be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void CopyMatrix(const int src_one, const int src_two, +__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +void CopyMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index d5c46a30..a2007408 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -36,8 +36,8 @@ R"( // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without // offset. A more general version is available in 'padtranspose.opencl'. -__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) -__kernel void TransposeMatrixFast(const int ld, +__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +void TransposeMatrixFast(const int ld, __global const realT* restrict src, __global realT* dest, const __constant real* restrict arg_alpha) { diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index 2de0c7bd..63cc6e9a 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,8 +24,8 @@ R"( // Transposes a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void TransposePadMatrix(const int src_one, const int src_two, +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposePadMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, @@ -88,8 +88,8 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two, // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. 
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void TransposeMatrix(const int src_one, const int src_two, +__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +void TransposeMatrix(const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, const int dest_one, const int dest_two, diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index 42c1127c..60e38c06 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -268,8 +268,8 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, #if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) // Main entry point of the kernel. This is the upper-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmUpper(const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmUpper(const int kSizeN, const int kSizeK, const __constant real* restrict arg_alpha, const __constant real* restrict arg_beta, const __global realM* restrict agm, @@ -308,8 +308,8 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK, } // Main entry point of the kernel. This is the lower-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmLower(const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmLower(const int kSizeN, const int kSizeK, const __constant real* restrict arg_alpha, const __constant real* restrict arg_beta, const __global realM* restrict agm, @@ -352,8 +352,8 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK, #else // Main entry point of the kernel. This is the regular full version. 
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, const __constant real* restrict arg_alpha, const __constant real* restrict arg_beta, const __global realM* restrict agm, -- cgit v1.2.3 From 00979faab4cd83a1810e9faf3e9bffe36d535763 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 21 Aug 2016 20:16:06 +0200 Subject: Updated the changelog; refactored the database-get-bests code a bit --- CHANGELOG | 1 + scripts/database/database/bests.py | 23 +++++++++++++++++++++++ scripts/database/database/defaults.py | 24 ++++-------------------- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 8fce1969..852b734f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,7 @@ Development version (next release) - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta parameter passing to kernels - Added an option (-warm_up) to do a warm-up run before timing in the performance clients - Improved performance significantly of rotated GEMV computations +- Improved performance of unseen/un-tuned devices by a better default tuning parameter selection - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) diff --git a/scripts/database/database/bests.py b/scripts/database/database/bests.py index edb81733..e6239258 100644 --- a/scripts/database/database/bests.py +++ b/scripts/database/database/bests.py @@ -18,3 +18,26 @@ def get_best_results(df): best_parameters = database_entry[database_entry["time"] == best_time].iloc[0] database_bests = database_bests.append(best_parameters, ignore_index=True) return database_bests + + +def get_relative_bests(df, parameter_column_names, name, verbose=False): + """Retrieves the relative best execution time over different devices""" + + # Computes the sum of the execution times over the different devices + def sum_performance(x): + x["group_performance"] = x["relative_performance"].sum() + return x + df = df.groupby(parameter_column_names).apply(sum_performance) + + # Retrieves the entries with the highest performance + best_performance = df["group_performance"].max() + df_bests = df[df["group_performance"] == best_performance] + + # Retrieves one example only (the parameters are the same anyway) + df_bests = df_bests.drop_duplicates(["group_performance"]) + + # Completed, report and return the results + if verbose: + print("[database] " + str(name) + " with performance " + str(best_performance) + " " + str(df_bests.shape)) + assert len(df_bests) == 1 + return df_bests diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 985f24bd..3428d9a9 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -97,25 +97,9 @@ def get_common_best(database, group_name, verbose): # Fall back to another method in case there are no shared entries at all across devices if len(database_common) == 0: - # print("[database] Skipping: " + str(group_name) + " with devices: %d %d " % (num_devices, len(database))) + if verbose: + print("[database] No common kernels for: " + str(group_name) + " with devices: %d " % num_devices) return get_smallest_best(database) - # Computes the sum of the execution times over the different devices - def sum_performance(x): - x["group_performance"] = x["relative_performance"].sum() 
- return x - database_common = database_common.groupby(parameter_column_names).apply(sum_performance) - - # Retrieves the entries with the highest performance - best_performance = database_common["group_performance"].max() - database_bests = database_common[database_common["group_performance"] == best_performance] - - # Retrieves one example only (the parameters are the same anyway) - database_bests = database_bests.drop_duplicates(["group_performance"]) - - # Completed, report and return the results - if verbose: - print("[database] " + str(group_name) + " with performance " + str(best_performance) + " with devices: " + - str(num_devices) + " " + str(database_bests.shape)) - assert len(database_bests) == 1 - return database_bests + # Retrieves the entries with the highest relative performance + return bests.get_relative_bests(database_common, parameter_column_names, group_name, verbose) -- cgit v1.2.3 From 84db8958d1238d7b171d83989d60c3605a4e2ba2 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 21 Aug 2016 20:28:02 +0200 Subject: Increased the ratio of GEMM tuning results to explore; reduced the tuning search space to have a better chance to evaluate more likely parameter combinations --- src/tuning/kernels/xgemm.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index 898b8435..eb7c8a66 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -48,7 +48,7 @@ class TuneXgemm { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } - static double DefaultFraction() { return 2048.0; } + static double DefaultFraction() { return 256.0; } // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel @@ -67,9 +67,9 @@ class TuneXgemm { tuner.AddParameter(id, "NDIMC", {8, 16, 32}); tuner.AddParameter(id, "MDIMA", {8, 16, 32}); tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2, 8}); - tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); - tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4}); + tuner.AddParameter(id, "VWN", {1, 2, 4}); tuner.AddParameter(id, "STRM", {0, 1}); tuner.AddParameter(id, "STRN", {0, 1}); tuner.AddParameter(id, "SA", {0, 1}); -- cgit v1.2.3 From 0c0f0ac7f9b3b06244d5bdcde48fde4f9ffceb58 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 21 Aug 2016 20:35:20 +0200 Subject: Also changed the default-default for unknown device types to use the same method as for known device groups --- scripts/database/database/defaults.py | 5 ++++- src/database/kernels/copy.hpp | 8 ++++---- src/database/kernels/pad.hpp | 8 ++++---- src/database/kernels/padtranspose.hpp | 8 ++++---- src/database/kernels/transpose.hpp | 8 ++++---- src/database/kernels/xaxpy.hpp | 6 +++--- src/database/kernels/xdot.hpp | 8 ++++---- src/database/kernels/xgemv.hpp | 2 +- src/database/kernels/xgemv_fast_rot.hpp | 8 ++++---- 9 files changed, 32 insertions(+), 29 deletions(-) diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 3428d9a9..8a02c201 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -51,7 +51,10 @@ def calculate_defaults(database, verbose, calculate_common_best=True): # Defaults over all device types and vendors groups = 
database.groupby(clblast.KERNEL_ATTRIBUTES + ["kernel"] + clblast.ARGUMENT_ATTRIBUTES) for group_name, database_group in groups: - default_values = get_smallest_best(database_group) + if calculate_common_best: + default_values = get_common_best(database_group, group_name, verbose) + else: + default_values = get_smallest_best(database_group) default_values["device_vendor"] = clblast.VENDOR_DEFAULT default_values["device_type"] = clblast.DEVICE_TYPE_DEFAULT default_values = set_default_device(default_values) diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index e9902293..0ea2b495 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -93,7 +93,7 @@ const Database::DatabaseEntry Database::CopySingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } }, } }, } @@ -154,7 +154,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, } @@ -213,7 +213,7 @@ const Database::DatabaseEntry Database::CopyDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, } @@ -272,7 +272,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, } }, } diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index a242a827..d1643447 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -93,7 +93,7 @@ const Database::DatabaseEntry Database::PadSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, } }, } @@ -162,7 +162,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -221,7 +221,7 @@ const Database::DatabaseEntry Database::PadDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } @@ -280,7 +280,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, } }, } diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index 0f63eafa..bee66a4d 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -93,7 +93,7 @@ const Database::DatabaseEntry 
Database::PadtransposeSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -162,7 +162,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -221,7 +221,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, } }, } @@ -280,7 +280,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, + { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } }, } }, } diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index d12d28f0..bf8ac665 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -93,7 +93,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, } }, } @@ -156,7 +156,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, } }, } @@ -215,7 +215,7 @@ const Database::DatabaseEntry Database::TransposeDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, } }, } @@ -268,7 +268,7 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } }, } }, } diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 2f1dd638..78ff3288 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -93,7 +93,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, } }, } @@ -162,7 +162,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, } @@ -280,7 +280,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, + { "default", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, } }, } diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 394c25de..51ab4099 100644 --- 
a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -75,7 +75,7 @@ const Database::DatabaseEntry Database::XdotSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } @@ -126,7 +126,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",32}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } @@ -168,7 +168,7 @@ const Database::DatabaseEntry Database::XdotDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",64} } }, } }, } @@ -210,7 +210,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, } diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index ba71893e..03b25bdc 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -196,7 +196,7 @@ const Database::DatabaseEntry Database::XgemvDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",128}, {"WPT1",1} } }, } }, } diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index a57dcc66..42d3c5d1 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -43,7 +43,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, } @@ -74,7 +74,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, } @@ -104,7 +104,7 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",1}, {"WGS3",16}, {"WPT3",8} } }, + { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, } }, } @@ -128,7 +128,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } }, } }, } -- cgit v1.2.3 From ea43936e94fdf2eabb913c1ebc1ac6143bde5bba Mon Sep 17 00:00:00 2001 From: Ivan Shapovalov Date: Sat, 2 Jul 2016 02:38:48 +0300 Subject: test/correctness: read platform and device from environment Support passing environment variables CLBLAST_PLATFORM and CLBLAST_DEVICE instead of -platform and -device arguments to test executables. This is for `ctest`. 
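The environment variables only supply the default values: an explicit -platform or -device argument still takes precedence, and when neither is set the indices fall back to 0. A minimal standalone sketch of that fallback chain (the IndexFromEnv helper is illustrative only, not part of the CLBlast API):

  #include <cstdio>
  #include <cstdlib>
  #include <string>

  // Reads an environment variable as a platform/device index, or returns
  // 'fallback' when the variable is not set.
  size_t IndexFromEnv(const char* name, const size_t fallback) {
    const char* value = std::getenv(name);
    return (value != nullptr) ? static_cast<size_t>(std::stoi(value)) : fallback;
  }

  int main() {
    // Lowest priority: 0. Next: CLBLAST_PLATFORM / CLBLAST_DEVICE from the
    // environment. Highest: the -platform / -device flags, which the real
    // argument parser applies on top of these defaults.
    const auto platform_id = IndexFromEnv("CLBLAST_PLATFORM", size_t{0});
    const auto device_id = IndexFromEnv("CLBLAST_DEVICE", size_t{0});
    std::printf("platform %zu, device %zu\n", platform_id, device_id);
    return 0;
  }

With this in place, a command such as CLBLAST_PLATFORM=1 CLBLAST_DEVICE=0 ctest steers all test executables to the intended OpenCL platform and device without touching their command lines.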
--- src/utilities.cpp | 11 +++++++++++ src/utilities.hpp | 4 ++++ test/correctness/tester.cpp | 5 +++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/utilities.cpp b/src/utilities.cpp index 11a6c439..77bc72d7 100644 --- a/src/utilities.cpp +++ b/src/utilities.cpp @@ -161,6 +161,8 @@ template T ConvertArgument(const char* value) { return static_cast(std::stoi(value)); } +template size_t ConvertArgument(const char* value); + template <> half ConvertArgument(const char* value) { return FloatToHalf(static_cast(std::stod(value))); } @@ -179,6 +181,15 @@ template <> double2 ConvertArgument(const char* value) { return double2{val, val}; } +// Variant of "ConvertArgument" with default values +template +T ConvertArgument(const char* value, T default_value) { + + if (value) { return ConvertArgument(value); } + return default_value; +} +template size_t ConvertArgument(const char* value, size_t default_value); + // This function matches patterns in the form of "-option value" or "--option value". It returns a // default value in case the option is not found in the argument string. template diff --git a/src/utilities.hpp b/src/utilities.hpp index 700d30d6..75bd5a69 100644 --- a/src/utilities.hpp +++ b/src/utilities.hpp @@ -187,6 +187,10 @@ std::string ToString(T value); template T ConvertArgument(const char* value); +// Variant of "ConvertArgument" with default values +template +T ConvertArgument(const char* value, T default_value); + // Basic argument parser, matching patterns in the form of "-option value" and "--option value" template T GetArgument(const int argc, char **argv, std::string &help, diff --git a/test/correctness/tester.cpp b/test/correctness/tester.cpp index 92e2c1b8..362c5c2c 100644 --- a/test/correctness/tester.cpp +++ b/test/correctness/tester.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "test/correctness/tester.hpp" @@ -27,8 +28,8 @@ template Tester::Tester(int argc, char *argv[], const bool silent, const std::string &name, const std::vector &options): help_("Options given/available:\n"), - platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, size_t{0}))), - device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, size_t{0}))), + platform_(Platform(GetArgument(argc, argv, help_, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})))), + device_(Device(platform_, GetArgument(argc, argv, help_, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})))), context_(Context(device_)), queue_(Queue(context_, device_)), full_test_(CheckArgument(argc, argv, help_, kArgFullTest)), -- cgit v1.2.3 From 19574b2519e723cbca0b0fa0964e8e5823c24911 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 3 Sep 2016 12:44:11 +0200 Subject: Updated tuning results for Haswell GT2 Mobile GPU; fixed database script to handle duplicate entries of different runs --- scripts/database/database/defaults.py | 9 +++++++++ src/database/kernels/copy.hpp | 4 ++-- src/database/kernels/pad.hpp | 6 +++--- src/database/kernels/transpose.hpp | 2 +- src/database/kernels/xaxpy.hpp | 4 ++-- src/database/kernels/xdot.hpp | 4 ++-- src/database/kernels/xgemv.hpp | 4 ++-- src/database/kernels/xgemv_fast.hpp | 6 +++--- src/database/kernels/xgemv_fast_rot.hpp | 6 ++++-- src/database/kernels/xger.hpp | 22 +++++++++++----------- 10 files changed, 39 insertions(+), 28 deletions(-) diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 8a02c201..73ad2bcf 100644 --- 
a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -84,6 +84,15 @@ def get_common_best(database, group_name, verbose): database = database.dropna(axis=1, how='all') database = database.reset_index() + # In case multiple runs for the exact same configuration where made: take just the best performing one into account + other_column_names = list(database.columns.values) + other_column_names.remove("time") + database_by_time = database.groupby(other_column_names) + if len(database_by_time) != len(database): + if verbose: + print("[database] " + str(group_name) + " keeping only entries with the lowest execution time") + database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()]) + # Inserts the relative execution times into the database def relative_performance(x): x["relative_performance"] = x["time"].min() / x["time"] diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index 0ea2b495..7639913b 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -61,11 +61,11 @@ const Database::DatabaseEntry Database::CopySingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, } }, { // Intel accelerators diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index d1643447..8d541db0 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -61,11 +61,11 @@ const Database::DatabaseEntry Database::PadSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris Pro", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, - { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } }, + { "default", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, { // Intel accelerators @@ -93,7 +93,7 @@ const Database::DatabaseEntry Database::PadSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "default", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } }, } diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index bf8ac665..69bbd950 
100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -61,7 +61,7 @@ const Database::DatabaseEntry Database::TransposeSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 78ff3288..7e1e5912 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -61,7 +61,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",8}, {"WGS",256}, {"WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } }, { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",128}, {"WPT",2} } }, @@ -134,7 +134,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "Iris Pro", { {"VW",1}, {"WGS",256}, {"WPT",8} } }, - { "default", { {"VW",1}, {"WGS",128}, {"WPT",8} } }, + { "default", { {"VW",1}, {"WGS",256}, {"WPT",2} } }, } }, { // Intel accelerators diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index 51ab4099..e0a57a61 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -53,7 +53,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, { "default", { {"WGS1",64}, {"WGS2",32} } }, @@ -126,7 +126,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",256}, {"WGS2",64} } }, } }, } diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 03b25bdc..4eda857e 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -54,7 +54,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Iris", { 
{"WGS1",64}, {"WPT1",2} } }, { "Iris Pro", { {"WGS1",256}, {"WPT1",2} } }, @@ -86,7 +86,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",4} } }, } }, } diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index 7e948540..173f09aa 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -58,7 +58,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, } }, { // Intel accelerators @@ -86,7 +86,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, } }, } @@ -120,7 +120,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, - { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, { // Intel accelerators diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index 42d3c5d1..7aa43b82 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -30,9 +30,10 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, - { "default", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, } }, { // NVIDIA GPUs @@ -67,6 +68,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, { "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } }, @@ -74,7 +76,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, + { "default", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, } }, } diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 06f65c46..5903e090 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -54,16 +54,16 @@ const Database::DatabaseEntry Database::XgerSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",512}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",128}, 
{"WGS2",8}, {"WPT",4} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",16}, {"WGS2",8}, {"WPT",4} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } }, } }, { // NVIDIA GPUs @@ -80,7 +80,7 @@ const Database::DatabaseEntry Database::XgerSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, } }, } @@ -110,13 +110,13 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, } }, { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, - { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } }, + { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } }, { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, @@ -136,7 +136,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",4}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, } @@ -166,7 +166,7 @@ const Database::DatabaseEntry Database::XgerDouble = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } }, } }, { // NVIDIA GPUs @@ -183,7 +183,7 @@ const Database::DatabaseEntry Database::XgerDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, } }, } @@ -213,7 +213,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { kDeviceTypeCPU, "Intel", { { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz", { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } }, { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, - { "default", { {"WGS1",512}, {"WGS2",1}, {"WPT",4} } }, + { "default", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, } }, { // NVIDIA GPUs @@ -230,7 +230,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } }, + { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, } }, } -- cgit v1.2.3 From 521bf6cdfc650f82488c1e07918eeabd7b328a78 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 3 Sep 2016 16:43:23 +0200 Subject: Added tuning results for Intel 
Broadwell 5500 GT2 GPU --- README.md | 1 + scripts/database/database/defaults.py | 3 +-- src/database/kernels/copy.hpp | 5 ++++- src/database/kernels/pad.hpp | 3 +++ src/database/kernels/padtranspose.hpp | 3 +++ src/database/kernels/transpose.hpp | 5 ++++- src/database/kernels/xaxpy.hpp | 5 ++++- src/database/kernels/xdot.hpp | 5 ++++- src/database/kernels/xgemm.hpp | 2 ++ src/database/kernels/xgemv.hpp | 3 +++ src/database/kernels/xgemv_fast.hpp | 9 ++++++--- src/database/kernels/xgemv_fast_rot.hpp | 2 ++ src/database/kernels/xger.hpp | 9 ++++++--- 13 files changed, 43 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index feff1d40..1dd3ea65 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ The CLBlast library will be tuned in the future for the most commonly used OpenC * Intel GPUs: - HD Graphics 530 - HD Graphics Haswell Ultrabook GT2 Mobile + - HD Graphics 5500 BroadWell U-Processor GT2 - HD Graphics Skylake ULT GT2 - Iris - Iris Pro diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 73ad2bcf..3bde33c1 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -45,8 +45,7 @@ def calculate_defaults(database, verbose, calculate_common_best=True): groups = database_defaults.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"]) for group_name, database_group in groups: if len(database_group) != 1: - description = database_group["kernel"].min() + " " + database_group["device_vendor"].min() - print("[WARNING] Entries for a single kernel with multiple argument values: " + description) + print("[WARNING] Entries for a single kernel with multiple argument values: " + str(group_name)) # Defaults over all device types and vendors groups = database.groupby(clblast.KERNEL_ATTRIBUTES + ["kernel"] + clblast.ARGUMENT_ATTRIBUTES) diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index 7639913b..dc2011fd 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::CopyHalf = { "Copy", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } }, } @@ -61,6 +62,7 @@ const Database::DatabaseEntry Database::CopySingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, @@ -124,11 +126,12 @@ const Database::DatabaseEntry Database::CopyComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, 
{"COPY_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Iris", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Iris Pro", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } }, - { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } }, + { "default", { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, } }, { // Intel accelerators diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp index 8d541db0..3cfabaf4 100644 --- a/src/database/kernels/pad.hpp +++ b/src/database/kernels/pad.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadHalf = { "Pad", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "default", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, } @@ -61,6 +62,7 @@ const Database::DatabaseEntry Database::PadSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } }, @@ -130,6 +132,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, { "Iris", { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } }, diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp index bee66a4d..88bd4ea7 100644 --- a/src/database/kernels/padtranspose.hpp +++ b/src/database/kernels/padtranspose.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = { "Padtranspose", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, { "default", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } }, } @@ -61,6 +62,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", 
{ {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Iris", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, @@ -130,6 +132,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } }, { "Iris", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } }, diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp index 69bbd950..0e1b608e 100644 --- a/src/database/kernels/transpose.hpp +++ b/src/database/kernels/transpose.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::TransposeHalf = { "Transpose", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "default", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } @@ -61,11 +62,12 @@ const Database::DatabaseEntry Database::TransposeSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, { "Iris Pro", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, - { "default", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, + { "default", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } }, } }, { // Intel accelerators @@ -130,6 +132,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } }, { "Iris", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 7e1e5912..60fa7555 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry 
Database::XaxpyHalf = { "Xaxpy", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, } @@ -61,6 +62,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",1}, {"WGS",512}, {"WPT",2} } }, { "Iris", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, @@ -93,7 +95,7 @@ const Database::DatabaseEntry Database::XaxpySingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, + { "default", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, } }, } @@ -130,6 +132,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW",4}, {"WGS",64}, {"WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",2}, {"WGS",512}, {"WPT",1} } }, { "Iris", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index e0a57a61..f862d00e 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XdotHalf = { "Xdot", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, { "default", { {"WGS1",32}, {"WGS2",32} } }, } @@ -53,6 +54,7 @@ const Database::DatabaseEntry Database::XdotSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WGS2",32} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",512}, {"WGS2",64} } }, @@ -104,6 +106,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WGS2",32} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",32}, {"WGS2",32} } }, { "Iris Pro", { {"WGS1",32}, {"WGS2",32} } }, @@ -126,7 +129,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",256}, {"WGS2",64} } }, + { "default", { {"WGS1",256}, {"WGS2",32} } }, } }, } diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 61b7ff05..c960592d 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -43,6 +43,7 @@ const 
Database::DatabaseEntry Database::XgemmSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } }, { "Iris", { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, @@ -112,6 +113,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } }, { "Iris", { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 4eda857e..7e8e64e3 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XgemvHalf = { "Xgemv", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1} } }, { "default", { {"WGS1",128}, {"WPT1",1} } }, } @@ -54,6 +55,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",256}, {"WPT1",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Iris", { {"WGS1",64}, {"WPT1",2} } }, @@ -116,6 +118,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",64}, {"WPT1",1} } }, + { 
"Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Iris", { {"WGS1",256}, {"WPT1",1} } }, diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index 173f09aa..f5e3e630 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XgemvFastHalf = { "XgemvFast", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, { "default", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, } @@ -54,11 +55,12 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, } }, { // Intel accelerators @@ -116,11 +118,12 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, } }, { // Intel accelerators @@ -142,7 +145,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, } }, } diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index 7aa43b82..3d2e0d3a 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -30,6 +30,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } }, @@ -68,6 +69,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { }, { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } }, { "Intel(R) HD 
Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } }, { "Iris Pro", { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } }, diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp index 5903e090..3e9c25c1 100644 --- a/src/database/kernels/xger.hpp +++ b/src/database/kernels/xger.hpp @@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XgerHalf = { "Xger", Precision::kHalf, { { // Intel GPUs kDeviceTypeGPU, "Intel", { + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } }, } @@ -60,10 +61,11 @@ const Database::DatabaseEntry Database::XgerSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } }, { "Iris Pro", { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } }, - { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",4} } }, + { "default", { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } }, } }, { // NVIDIA GPUs @@ -116,10 +118,11 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { { // Intel GPUs kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 530", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, + { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } }, { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } }, { "Iris Pro", { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } }, - { "default", { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } }, } }, { // NVIDIA GPUs @@ -136,7 +139,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } }, + { "default", { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } }, } }, } -- cgit v1.2.3 From b30b26b89e52eceb06f5661622c3de0312206ab4 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 4 Sep 2016 17:21:16 +0200 Subject: The GEMM kernel no longer adds beta*C in case beta is zero; this would cause problems if C contains NaNs --- CHANGELOG | 5 +- src/kernels/common.opencl | 7 ++ src/kernels/level3/xgemm_part2.opencl | 124 +++++++++++++++++++++++----------- 3 files changed, 94 insertions(+), 42 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 9b027e6d..10cde25d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,15 +1,16 @@ Development version (next release) - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header +- Improved performance significantly of rotated GEMV computations +- Improved performance of unseen/un-tuned devices by a better default tuning parameter selection - Fixed proper MSVC dllimport and dllexport declarations - Fixed memory leaks related to events not being released - Fixed a bug with a size_t and cl_ulong mismatch on 32-bit systems - Fixed a bug related to the cache and retrieval of programs based on the OpenCL context - Fixed a performance issue (caused by fp16 support) by optimizing alpha/beta 
parameter passing to kernels - Fixed a bug in the OpenCL kernels: now placing __kernel before __attribute__ +- Fixed a bug in level-3 routines when beta is zero and matrix C contains NaNs - Added an option (-warm_up) to do a warm-up run before timing in the performance clients -- Improved performance significantly of rotated GEMV computations -- Improved performance of unseen/un-tuned devices by a better default tuning parameter selection - Various minor fixes and enhancements - Added tuned parameters for various devices (see README) diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index 9d2bb65e..223501fd 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -148,6 +148,13 @@ R"( #define SetToOne(a) a = ONE #endif +// Determines whether a variable is zero +#if PRECISION == 3232 || PRECISION == 6464 + #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO)) +#else + #define IsZero(a) (a == ZERO) +#endif + // The absolute value (component-wise) #if PRECISION == 3232 || PRECISION == 6464 #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y) diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index a1559b54..faf17e49 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -133,49 +133,93 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int #endif int idm = mg + GetGroupID0() * (MWG/VWM); int idn = ng + GetGroupID1() * NWG; - - // The final multiplication with alpha and the addition with beta*C int index = idn*(kSizeM/VWM) + idm; + realM result; realM xval = cpm[ni][mi]; - realM yval = cgm[index]; - #if VWM == 1 - AXPBY(result, alpha, xval, beta, yval); - #elif VWM == 2 - AXPBY(result.x, alpha, xval.x, beta, yval.x); - AXPBY(result.y, alpha, xval.y, beta, yval.y); - #elif VWM == 4 - AXPBY(result.x, alpha, xval.x, beta, yval.x); - AXPBY(result.y, alpha, xval.y, beta, yval.y); - AXPBY(result.z, alpha, xval.z, beta, yval.z); - AXPBY(result.w, alpha, xval.w, beta, yval.w); - #elif VWM == 8 - AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); - AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); - AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); - AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); - AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); - AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); - AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); - AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); - #elif VWM == 16 - AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); - AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); - AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); - AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); - AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); - AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); - AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); - AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); - AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); - AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); - AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); - AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); - AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); - AXPBY(result.sD, alpha, xval.sD, beta, yval.sD); - AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); - AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); - #endif + + // The final multiplication with alpha (in case beta == 0) + if (IsZero(beta)) { + #if VWM == 1 + Multiply(result, alpha, xval); + #elif VWM == 2 + Multiply(result.x, alpha, xval.x); + Multiply(result.y, alpha, 
xval.y); + #elif VWM == 4 + Multiply(result.x, alpha, xval.x); + Multiply(result.y, alpha, xval.y); + Multiply(result.z, alpha, xval.z); + Multiply(result.w, alpha, xval.w); + #elif VWM == 8 + Multiply(result.s0, alpha, xval.s0); + Multiply(result.s1, alpha, xval.s1); + Multiply(result.s2, alpha, xval.s2); + Multiply(result.s3, alpha, xval.s3); + Multiply(result.s4, alpha, xval.s4); + Multiply(result.s5, alpha, xval.s5); + Multiply(result.s6, alpha, xval.s6); + Multiply(result.s7, alpha, xval.s7); + #elif VWM == 16 + Multiply(result.s0, alpha, xval.s0); + Multiply(result.s1, alpha, xval.s1); + Multiply(result.s2, alpha, xval.s2); + Multiply(result.s3, alpha, xval.s3); + Multiply(result.s4, alpha, xval.s4); + Multiply(result.s5, alpha, xval.s5); + Multiply(result.s6, alpha, xval.s6); + Multiply(result.s7, alpha, xval.s7); + Multiply(result.s8, alpha, xval.s8); + Multiply(result.s9, alpha, xval.s9); + Multiply(result.sA, alpha, xval.sA); + Multiply(result.sB, alpha, xval.sB); + Multiply(result.sC, alpha, xval.sC); + Multiply(result.sD, alpha, xval.sD); + Multiply(result.sE, alpha, xval.sE); + Multiply(result.sF, alpha, xval.sF); + #endif + } + + // The final multiplication with alpha and the addition with beta*C + else { + realM yval = cgm[index]; + #if VWM == 1 + AXPBY(result, alpha, xval, beta, yval); + #elif VWM == 2 + AXPBY(result.x, alpha, xval.x, beta, yval.x); + AXPBY(result.y, alpha, xval.y, beta, yval.y); + #elif VWM == 4 + AXPBY(result.x, alpha, xval.x, beta, yval.x); + AXPBY(result.y, alpha, xval.y, beta, yval.y); + AXPBY(result.z, alpha, xval.z, beta, yval.z); + AXPBY(result.w, alpha, xval.w, beta, yval.w); + #elif VWM == 8 + AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); + AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); + AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); + AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); + AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); + AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); + AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); + AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); + #elif VWM == 16 + AXPBY(result.s0, alpha, xval.s0, beta, yval.s0); + AXPBY(result.s1, alpha, xval.s1, beta, yval.s1); + AXPBY(result.s2, alpha, xval.s2, beta, yval.s2); + AXPBY(result.s3, alpha, xval.s3, beta, yval.s3); + AXPBY(result.s4, alpha, xval.s4, beta, yval.s4); + AXPBY(result.s5, alpha, xval.s5, beta, yval.s5); + AXPBY(result.s6, alpha, xval.s6, beta, yval.s6); + AXPBY(result.s7, alpha, xval.s7, beta, yval.s7); + AXPBY(result.s8, alpha, xval.s8, beta, yval.s8); + AXPBY(result.s9, alpha, xval.s9, beta, yval.s9); + AXPBY(result.sA, alpha, xval.sA, beta, yval.sA); + AXPBY(result.sB, alpha, xval.sB, beta, yval.sB); + AXPBY(result.sC, alpha, xval.sC, beta, yval.sC); + AXPBY(result.sD, alpha, xval.sD, beta, yval.sD); + AXPBY(result.sE, alpha, xval.sE, beta, yval.sE); + AXPBY(result.sF, alpha, xval.sF, beta, yval.sF); + #endif + } cgm[index] = result; } } -- cgit v1.2.3 From a2f83507033a20b534099c7b21d4a7466108e949 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 4 Sep 2016 21:26:12 +0200 Subject: Refactored the Python C++ generator script; now confirms to the PEP8 styleguide --- scripts/generator/datatype.py | 70 ---- scripts/generator/generator.py | 664 ++++++++------------------------ scripts/generator/generator/__init__.py | 0 scripts/generator/generator/convert.py | 69 ++++ scripts/generator/generator/cpp.py | 257 ++++++++++++ scripts/generator/generator/datatype.py | 92 +++++ scripts/generator/generator/doc.py | 57 +++ 
scripts/generator/generator/routine.py | 552 ++++++++++++++++++++++++++ scripts/generator/routine.py | 603 ----------------------------- 9 files changed, 1191 insertions(+), 1173 deletions(-) delete mode 100644 scripts/generator/datatype.py create mode 100644 scripts/generator/generator/__init__.py create mode 100644 scripts/generator/generator/convert.py create mode 100644 scripts/generator/generator/cpp.py create mode 100644 scripts/generator/generator/datatype.py create mode 100644 scripts/generator/generator/doc.py create mode 100644 scripts/generator/generator/routine.py delete mode 100644 scripts/generator/routine.py diff --git a/scripts/generator/datatype.py b/scripts/generator/datatype.py deleted file mode 100644 index 5bff95d1..00000000 --- a/scripts/generator/datatype.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. -# -# Author(s): -# Cedric Nugteren -# -# This file contains the 'DataType' class, used in the generator script to generate the CLBlast API -# interface and implementation. -# -# ================================================================================================== - -# Short-hands for data-types -HLF = "half" -FLT = "float" -DBL = "double" -FLT2 = "float2" -DBL2 = "double2" - -HCL = "cl_half" -F2CL = "cl_float2" -D2CL = "cl_double2" - -# Structure holding data-type and precision information -class DataType(): - def __init__(self, precision_name, name, template, scalars, buffertype): - self.precision_name = precision_name - self.name = name - self.template = template - self.alpha_cpp = scalars[0] - self.beta_cpp = scalars[1] - self.alpha_cl = scalars[2] - self.beta_cl = scalars[3] - self.buffertype = buffertype - - # Outputs the name of the data-type (alpha/beta), possibly transforming into the right type - def UseAlpha(self): - if self.alpha_cpp in [FLT2, DBL2]: - return self.alpha_cpp+"{alpha.s[0], alpha.s[1]}" - return "alpha" - def UseBeta(self): - if self.beta_cpp in [FLT2, DBL2]: - return self.beta_cpp+"{beta.s[0], beta.s[1]}" - return "beta" - - # As above, but the transformation is in the opposite direction - def UseAlphaCL(self): - if self.alpha_cpp in [FLT2, DBL2]: - return self.alpha_cl+"{{alpha.real(), alpha.imag()}}" - return "alpha" - def UseBetaCL(self): - if self.beta_cpp in [FLT2, DBL2]: - return self.beta_cl+"{{beta.real(), beta.imag()}}" - return "beta" - - # Returns the template as used in the correctness/performance tests - def TestTemplate(self): - if self.buffertype != self.beta_cpp: - return "<"+self.buffertype+","+self.beta_cpp+">, "+self.buffertype+", "+self.beta_cpp - return "<"+self.buffertype+">, "+self.buffertype+", "+self.beta_cpp - - # Current scalar is complex - def IsComplex(self, scalar): - return ((scalar == "alpha" and self.alpha_cpp in [FLT2, DBL2]) or - (scalar == "beta" and self.beta_cpp in [FLT2, DBL2])) - - -# ================================================================================================== diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 6aa6fc18..d82b13a6 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -1,14 +1,13 @@ #!/usr/bin/env python -# 
================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. # # Author(s): # Cedric Nugteren # -# This script automatically generates the bodies of the following files, creating the full CLBlast -# API interface and implementation (C, C++, and reference BLAS wrappers): +# This script automatically generates the bodies of the following files, creating the full CLBlast API interface and +# implementation (C, C++, and reference BLAS wrappers): # clblast.h # clblast.cpp # clblast_c.h @@ -19,45 +18,20 @@ # test/correctness/routines/levelX/xYYYY.cpp # test/performance/routines/levelX/xYYYY.cpp # It also produces the API documentation found in doc/clblast.md -# -# ================================================================================================== -# System modules + import sys import os.path +import argparse -# Local files -from routine import Routine -from datatype import DataType, HLF, FLT, DBL, FLT2, DBL2, HCL, F2CL, D2CL +import generator.cpp as cpp +import generator.doc as doc +from generator.routine import Routine +from generator.datatype import H, S, D, C, Z, Sc, Dz, iH, iS, iD, iC, iZ, Css, Zdd, Ccs, Zzd, T, Tc, TU -# ================================================================================================== -# Regular data-types -H = DataType("H", "H", HLF, [HLF, HLF, HCL, HCL], HLF ) # half (16) -S = DataType("S", "S", FLT, [FLT, FLT, FLT, FLT], FLT ) # single (32) -D = DataType("D", "D", DBL, [DBL, DBL, DBL, DBL], DBL ) # double (64) -C = DataType("C", "C", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # single-complex (3232) -Z = DataType("Z", "Z", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # double-complex (6464) - -# Special cases -Sc = DataType("C", "Sc", FLT2, [FLT2, FLT2, FLT2, FLT2], FLT2) # As C, but with real output -Dz = DataType("Z", "Dz", DBL2, [DBL2, DBL2, DBL2, DBL2], DBL2) # As Z, but with real output -iH = DataType("H", "iH", HLF, [HLF, HLF, HLF, HLF], HLF ) # As H, but with integer output -iS = DataType("S", "iS", FLT, [FLT, FLT, FLT, FLT], FLT ) # As S, but with integer output -iD = DataType("D", "iD", DBL, [DBL, DBL, DBL, DBL], DBL ) # As D, but with integer output -iC = DataType("C", "iC", FLT2, [FLT2, FLT2, F2CL, F2CL], FLT2) # As C, but with integer output -iZ = DataType("Z", "iZ", DBL2, [DBL2, DBL2, D2CL, D2CL], DBL2) # As Z, but with integer output -Css = DataType("C", "C", FLT, [FLT, FLT, FLT, FLT], FLT2) # As C, but with constants from S -Zdd = DataType("Z", "Z", DBL, [DBL, DBL, DBL, DBL], DBL2) # As Z, but with constants from D -Ccs = DataType("C", "C", FLT2+","+FLT, [FLT2, FLT, F2CL, FLT], FLT2) # As C, but with one constant from S -Zzd = DataType("Z", "Z", DBL2+","+DBL, [DBL2, DBL, D2CL, DBL], DBL2) # As Z, but with one constant from D - -# C++ template data-types -T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine -Tc = DataType("Tc", "typename T", "std::complex,T", ["T", "T", "T", "T"], "std::complex") # for herk -TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k - -# 
================================================================================================== +HEADER_LINES = [96, 73, 97, 22, 29, 41] +FOOTER_LINES = [17, 75, 19, 14, 6, 6] # Different possibilities for requirements ald_m = "The value of `a_ld` must be at least `m`." @@ -77,472 +51,162 @@ cld_n = "The value of `c_ld` must be at least `n`." # ================================================================================================== # Populates a list of routines -routines = [ -[ # Level 1: vector-vector - Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), - Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), - Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), - Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), - Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), - Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), - Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), - Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), - Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), - Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), - Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), - Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), - Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), - Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []), - Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. 
The resulting integer index is stored in the _imax_ buffer.", []), - Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), - Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), +ROUTINES = [ +[ # Level 1: vector-vector + Routine(False, True, "1", "rotg", T, [S,D], [], [], [], ["sa","sb","sc","ss"], [], "", "Generate givens plane rotation", "", []), + Routine(False, True, "1", "rotmg", T, [S,D], [], [], ["sy1"], ["sd1","sd2","sx1","sparam"], [], "", "Generate modified givens plane rotation", "", []), + Routine(False, True, "1", "rot", T, [S,D], ["n"], [], [], ["x","y"], ["cos","sin"], "", "Apply givens plane rotation", "", []), + Routine(False, True, "1", "rotm", T, [S,D], ["n"], [], [], ["x","y","sparam"], [], "", "Apply modified givens plane rotation", "", []), + Routine(True, True, "1", "swap", T, [S,D,C,Z,H], ["n"], [], [], ["x","y"], [], "", "Swap two vectors", "Interchanges _n_ elements of vectors _x_ and _y_.", []), + Routine(True, True, "1", "scal", T, [S,D,C,Z,H], ["n"], [], [], ["x"], ["alpha"], "", "Vector scaling", "Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.", []), + Routine(True, True, "1", "copy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [], "", "Vector copy", "Copies the contents of vector _x_ into vector _y_.", []), + Routine(True, True, "1", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], ["alpha"], "", "Vector-times-constant plus vector", "Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.", []), + Routine(True, True, "1", "dot", T, [S,D,H], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two vectors", "Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.", []), + Routine(True, True, "1", "dotu", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors", "See the regular xDOT routine.", []), + Routine(True, True, "1", "dotc", T, [C,Z], ["n"], [], ["x","y"], ["dot"], [], "n", "Dot product of two complex vectors, one conjugated", "See the regular xDOT routine.", []), + Routine(True, True, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []), + Routine(True, True, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []), + Routine(True, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. 
This routine is the non-absolute version of the xASUM BLAS routine.", []), + Routine(True, True, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []), + Routine(True, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []), + Routine(True, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []), ], -[ # Level 2: matrix-vector - Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), - Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), - Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), - Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), - Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), - Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same 
operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), - Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), - Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), - Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), - Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), - Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), +[ # Level 2: matrix-vector + Routine(True, True, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]), + Routine(True, True, "2a", "gbmv", T, [S,D,C,Z,H], ["m","n","kl","ku"], ["layout","a_transpose"], ["a","x"], ["y"], ["alpha","beta"], "", "General banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is banded instead.", [ald_kl_ku_one]), + Routine(True, True, "2a", "hemv", T, [C,Z], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian matrix instead.", [ald_n]), + Routine(True, True, "2a", "hbmv", T, [C,Z], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Hermitian banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian banded matrix instead.", [ald_k_one]), + Routine(True, True, "2a", "hpmv", T, [C,Z], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Hermitian packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "symv", T, [S,D,H], ["n"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric instead.", [ald_n]), + Routine(True, True, "2a", "sbmv", T, [S,D,H], ["n","k"], ["layout","triangle"], ["a","x"], ["y"], ["alpha","beta"], "", "Symmetric banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "spmv", T, [S,D,H], ["n"], ["layout","triangle"], ["ap","x"], ["y"], ["alpha","beta"], "", "Symmetric packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2a", "trmv", T, [S,D,C,Z,H], 
["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular instead.", [ald_n]), + Routine(True, True, "2a", "tbmv", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "n", "Triangular banded matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is triangular and banded instead.", [ald_k_one]), + Routine(True, True, "2a", "tpmv", T, [S,D,C,Z,H], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "n", "Triangular packed matrix-vector multiplication", "Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and repreented as _AP_.", []), + Routine(False, True, "2a", "trsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a triangular system of equations", "", []), + Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", [ald_k_one]), + Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr", T, 
[S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), - Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), - Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.", [ald_m]), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "Same operation as xGER, but with complex data-types.", [ald_m]), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.", [ald_m]), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "Performs the operation _A = alpha * x * x^T + A_, in which x is an input vector, x^T is the transpose of this vector, _A_ is the triangular Hermetian matrix to be updated, and alpha is a scalar value.", [ald_n]), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermetian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.", [ald_n]), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "Same operation as xHER, but matrix A is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "Same operation as xSPR, but matrix _A_ is a symmetric packed matrix instead and represented as 
_AP_.", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.", [ald_n]), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "Same operation as xSPR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.", []), ], -[ # Level 3: matrix-matrix - Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), - Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), - Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), - Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), - Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], 
["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), - Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), +[ # Level 3: matrix-matrix + Routine(True, True, "3", "gemm", T, [S,D,C,Z,H], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.", [ald_transa_m_k, bld_transb_k_n, cld_m]), + Routine(True, True, "3", "symm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Symmetric matrix-matrix multiplication", "Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmtric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "hemm", T, [C,Z], ["m","n"], ["layout","side","triangle"], ["a","b"], ["c"], ["alpha","beta"], "", "Hermitian matrix-matrix multiplication", "Same operation as xSYMM, but _A_ is an Hermitian matrix instead.", [ald_side_m_n, bld_m, cld_m]), + Routine(True, True, "3", "syrk", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "herk", Tc, [Css,Zdd], ["n","k"], ["layout","triangle","a_transpose"], ["a"], ["c"], ["alpha","beta"], "", "Rank-K update of a hermitian matrix", "Same operation as xSYRK, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, cld_m]), + Routine(True, True, "3", "syr2k", T, [S,D,C,Z,H], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a symmetric matrix", "Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "her2k", TU, [Ccs,Zzd], ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]), + Routine(True, True, "3", "trmm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], 
["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]), + Routine(False, True, "3", "trsm", T, [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []), ], -[ # Level X: extra routines (not part of BLAS) - Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), +[ # Level X: extra routines (not part of BLAS) + Routine(True, True, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), ]] -# ================================================================================================== -# Translates an option name to a CLBlast data-type -def PrecisionToFullName(x): - return { - 'H': "Half", - 'S': "Single", - 'D': "Double", - 'C': "ComplexSingle", - 'Z': "ComplexDouble", - }[x] - -# ================================================================================================== - -# Separators for the BLAS levels -separators = [""" -// ================================================================================================= -// BLAS level-1 (vector-vector) routines -// =================================================================================================""", -""" -// ================================================================================================= -// BLAS level-2 (matrix-vector) routines -// =================================================================================================""", -""" -// ================================================================================================= -// BLAS level-3 (matrix-matrix) routines -// =================================================================================================""", -""" -// ================================================================================================= -// Extra non-BLAS routines (level-X) -// ================================================================================================="""] - -# Names of the level sub-folders -levelnames = ["1", "2", "3", "x"] - -# Main header/footer for source files -header = """ -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. 
-// -// Author(s): -// Cedric Nugteren -// -// ================================================================================================= -""" -footer = """ -// ================================================================================================= -""" - -# ================================================================================================== - -# The C++ API header (.h) -def clblast_h(routines): - result = "" - for routine in routines: - result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - result += routine.RoutineHeaderCPP(12, " = nullptr")+";\n" - return result - -# The C++ API implementation (.cpp) -def clblast_cc(routines): - result = "" - for routine in routines: - indent1 = " "*(20 + routine.Length()) - result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - if routine.implemented: - result += routine.RoutineHeaderCPP(12, "")+" {\n" - result += " auto queue_cpp = Queue(*queue);\n" - result += " auto routine = X"+routine.name+"<"+routine.template.template+">(queue_cpp, event);\n" - result += " auto status = routine.SetUp();\n" - result += " if (status != StatusCode::kSuccess) { return status; }\n" - result += " return routine.Do"+routine.name.capitalize()+"(" - result += (",\n"+indent1).join([a for a in routine.ArgumentsCladuc(routine.template, indent1)]) - result += ");\n" - else: - result += routine.RoutineHeaderTypeCPP(12)+" {\n" - result += " return StatusCode::kNotImplemented;\n" - result += "}\n" - for flavour in routine.flavours: - indent2 = " "*(34 + routine.Length() + len(flavour.template)) - result += "template StatusCode PUBLIC_API "+routine.name.capitalize()+"<"+flavour.template+">(" - result += (",\n"+indent2).join([a for a in routine.ArgumentsType(flavour)]) - result += ",\n"+indent2+"cl_command_queue*, cl_event*);\n" - return result - -# ================================================================================================== - -# The C API header (.h) -def clblast_c_h(routines): - result = "" - for routine in routines: - result += "\n// "+routine.description+": "+routine.ShortNames()+"\n" - for flavour in routine.flavours: - result += routine.RoutineHeaderC(flavour, 31, " PUBLIC_API")+";\n" - return result - -# The C API implementation (.cpp) -def clblast_c_cc(routines): - result = "" - for routine in routines: - result += "\n// "+routine.name.upper()+"\n" - for flavour in routine.flavours: - template = "<"+flavour.template+">" if routine.NoScalars() else "" - indent = " "*(26 + routine.Length() + len(template)) - result += routine.RoutineHeaderC(flavour, 20, "")+" {\n" - result += " auto status = clblast::"+routine.name.capitalize()+template+"(" - result += (",\n"+indent).join([a for a in routine.ArgumentsCast(flavour, indent)]) - result += ",\n"+indent+"queue, event);" - result += "\n return static_cast(status);\n}\n" - return result - -# ================================================================================================== - -# The wrapper to the reference clBLAS routines (for performance/correctness testing) -def wrapper_clblas(routines): - result = "" - for routine in routines: - if routine.has_tests: - result += "\n// Forwards the clBLAS calls for %s\n" % (routine.ShortNamesTested()) - if routine.NoScalars(): - result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" - for flavour in routine.flavours: - result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" - - # There is a version available in clBLAS - if flavour.precision_name in 
["S","D","C","Z"]: - indent = " "*(17 + routine.Length()) - arguments = routine.ArgumentsWrapperCL(flavour) - if routine.scratch: - result += " auto queue = Queue(queues[0]);\n" - result += " auto context = queue.GetContext();\n" - result += " auto scratch_buffer = Buffer<"+flavour.template+">(context, "+routine.scratch+");\n" - arguments += ["scratch_buffer()"] - result += " return clblas"+flavour.name+routine.name+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" - - # There is no clBLAS available, forward the call to one of the available functions - else: # Half-precision - indent = " "*(24 + routine.Length()) - - # Convert to float (note: also integer buffers are stored as half/float) - for buf in routine.inputs + routine.outputs: - result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n" - - # Call the float routine - result += " auto status = clblasX"+routine.name+"(" - result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) - result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" - result += "\n" - - # Convert back to half - for buf in routine.outputs: - result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n" - result += " return status;" - - # Complete - result += "\n}\n" - return result - -# The wrapper to the reference CBLAS routines (for performance/correctness testing) -def wrapper_cblas(routines): - result = "" - for routine in routines: - if routine.has_tests: - result += "\n// Forwards the Netlib BLAS calls for %s\n" % (routine.ShortNamesTested()) - for flavour in routine.flavours: - result += routine.RoutineHeaderWrapperC(flavour, False, 12)+" {\n" - - # There is a version available in CBLAS - if flavour.precision_name in ["S","D","C","Z"]: - indent = " "*(10 + routine.Length()) - arguments = routine.ArgumentsWrapperC(flavour) - # Complex scalars - for scalar in routine.scalars: - if flavour.IsComplex(scalar): - result += " const auto "+scalar+"_array = std::vector<"+flavour.buffertype[:-1]+">{"+scalar+".real(), "+scalar+".imag()};\n" - - # Special case for scalar outputs - assignment = "" - postfix = "" - endofline = "" - extra_argument = "" - for output_buffer in routine.outputs: - if output_buffer in routine.ScalarBuffersFirst(): - if flavour in [C,Z]: - postfix += "_sub" - indent += " " - extra_argument += ",\n"+indent+"reinterpret_cast(&"+output_buffer+"_buffer["+output_buffer+"_offset])" - elif output_buffer in routine.IndexBuffers(): - assignment = "((int*)&"+output_buffer+"_buffer[0])["+output_buffer+"_offset] = " - indent += " "*len(assignment) - else: - assignment = output_buffer+"_buffer["+output_buffer+"_offset]" - if (flavour.name in ["Sc","Dz"]): - assignment = assignment+".real(" - endofline += ")" - else: - assignment = assignment+" = " - indent += " "*len(assignment) - - result += " "+assignment+"cblas_"+flavour.name.lower()+routine.name+postfix+"(" - result += (",\n"+indent).join([a for a in arguments]) - result += extra_argument+endofline+");\n" - - # There is no CBLAS available, forward the call to one of the available functions - else: # Half-precision - indent = " "*(9 + routine.Length()) - - # Convert to float (note: also integer buffers are stored as half/float) - for buf in routine.inputs + routine.outputs: - result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer);\n" - - # Call the float routine - result += " cblasX"+routine.name+"(" - result 
+= (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) - result += ");\n" - - # Convert back to half - for buf in routine.outputs: - result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis);\n" - - # Complete - result += "}\n" - return result - -# ================================================================================================== - -# Checks for the number of command-line arguments -if len(sys.argv) != 2: - print "[ERROR] Usage: generator.py " - sys.exit() - -# Parses the command-line arguments -path_clblast = sys.argv[1] -files = [ - path_clblast+"/include/clblast.h", - path_clblast+"/src/clblast.cpp", - path_clblast+"/include/clblast_c.h", - path_clblast+"/src/clblast_c.cpp", - path_clblast+"/test/wrapper_clblas.hpp", - path_clblast+"/test/wrapper_cblas.hpp", -] -header_lines = [96, 73, 97, 22, 29, 41] -footer_lines = [17, 75, 19, 14, 6, 6] - -# Checks whether the command-line arguments are valid; exists otherwise -for f in files: - if not os.path.isfile(f): - print "[ERROR] The path '"+path_clblast+"' does not point to the root of the CLBlast library" - sys.exit() - -# ================================================================================================== - -# Iterates over all files to output -for i in xrange(0,len(files)): - - # Stores the header and the footer of the original file - with open(files[i]) as f: - original = f.readlines() - file_header = original[:header_lines[i]] - file_footer = original[-footer_lines[i]:] - - # Re-writes the body of the file - with open(files[i], "w") as f: - body = "" - levels = [1,2,3] if (i == 4 or i == 5) else [1,2,3,4] - for level in levels: - body += separators[level-1]+"\n" - if i == 0: - body += clblast_h(routines[level-1]) - if i == 1: - body += clblast_cc(routines[level-1]) - if i == 2: - body += clblast_c_h(routines[level-1]) - if i == 3: - body += clblast_c_cc(routines[level-1]) - if i == 4: - body += wrapper_clblas(routines[level-1]) - if i == 5: - body += wrapper_cblas(routines[level-1]) - f.write("".join(file_header)) - f.write(body) - f.write("".join(file_footer)) - -# ================================================================================================== - -# Outputs all the correctness-test implementations -for level in [1,2,3,4]: - for routine in routines[level-1]: - if routine.has_tests: - filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cpp" - with open(filename, "w") as f: - body = "" - body += "#include \"test/correctness/testblas.hpp\"\n" - body += "#include \"test/routines/level"+levelnames[level-1]+"/x"+routine.name+".hpp\"\n\n" - body += "// Shortcuts to the clblast namespace\n" - body += "using float2 = clblast::float2;\n" - body += "using double2 = clblast::double2;\n\n" - body += "// Main function (not within the clblast namespace)\n" - body += "int main(int argc, char *argv[]) {\n" - body += " auto errors = size_t{0};\n" - not_first = "false" - for flavour in routine.flavours: - body += " errors += clblast::RunTests 0: - f.write("Requirements for "+routine.name.upper()+":\n") - f.write("\n") - for requirement in routine.RequirementsDoc(): - f.write("* "+requirement+"\n") - f.write("\n") - - # Routine footer - f.write("\n\n") - - -# ================================================================================================== +def main(argv): + + # Parses the command-line arguments + parser = argparse.ArgumentParser() + parser.add_argument("clblast_root", help="Root of the CLBlast sources") + parser.add_argument("-v", 
"--verbose", action="store_true", help="Increase verbosity of the script") + cl_args = parser.parse_args(argv) + library_root = cl_args.clblast_root + + # Sets all the files the output + files = [ + library_root + "/include/clblast.h", + library_root + "/src/clblast.cpp", + library_root + "/include/clblast_c.h", + library_root + "/src/clblast_c.cpp", + library_root + "/test/wrapper_clblas.hpp", + library_root + "/test/wrapper_cblas.hpp", + ] + + # Checks whether the command-line arguments are valid; exists otherwise + for f in files: + if not os.path.isfile(f): + print("[ERROR] The path '" + library_root + "' does not point to the root of the CLBlast library") + sys.exit() + + # Iterates over all regular files to output + for i in range(0, len(files)): + + # Stores the header and the footer of the original file + with open(files[i]) as f: + original = f.readlines() + file_header = original[:HEADER_LINES[i]] + file_footer = original[-FOOTER_LINES[i]:] + + # Re-writes the body of the file + with open(files[i], "w") as f: + body = "" + levels = [1, 2, 3] if (i == 4 or i == 5) else [1, 2, 3, 4] + for level in levels: + body += cpp.LEVEL_SEPARATORS[level - 1] + "\n" + for routine in ROUTINES[level - 1]: + if i == 0: + body += cpp.clblast_h(routine) + if i == 1: + body += cpp.clblast_cc(routine) + if i == 2: + body += cpp.clblast_c_h(routine) + if i == 3: + body += cpp.clblast_c_cc(routine) + if i == 4: + body += cpp.wrapper_clblas(routine) + if i == 5: + body += cpp.wrapper_cblas(routine) + f.write("".join(file_header)) + f.write(body) + f.write("".join(file_footer)) + + # Outputs all the test implementations + for level in [1, 2, 3, 4]: + for routine in ROUTINES[level - 1]: + if routine.has_tests: + level_string = cpp.LEVEL_NAMES[level - 1] + routine_suffix = "level" + level_string + "/x" + routine.name + ".cpp" + + # Correctness tests + filename = library_root + "/test/correctness/routines/" + routine_suffix + with open(filename, "w") as f: + f.write(cpp.HEADER + "\n") + f.write(cpp.correctness_test(routine, level_string)) + f.write(cpp.FOOTER) + + # Performance tests + filename = library_root + "/test/performance/routines/" + routine_suffix + with open(filename, "w") as f: + f.write(cpp.HEADER + "\n") + f.write(cpp.performance_test(routine, level_string)) + f.write(cpp.FOOTER) + + # Outputs the API documentation + filename = cl_args.clblast_root + "/doc/clblast.md" + with open(filename, "w") as f: + + # Outputs the header + doc_header = doc.header() + f.write(doc_header) + + # Generates the documentation for each routine + for level in [1, 2, 3, 4]: + for routine in ROUTINES[level - 1]: + if routine.implemented: + doc_routine = doc.generate(routine) + f.write(doc_routine) + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/scripts/generator/generator/__init__.py b/scripts/generator/generator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/generator/generator/convert.py b/scripts/generator/generator/convert.py new file mode 100644 index 00000000..c0309ec3 --- /dev/null +++ b/scripts/generator/generator/convert.py @@ -0,0 +1,69 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. 
+# +# Author(s): +# Cedric Nugteren + + +def precision_to_full_name(x): + """Translates an option name to a CLBlast data-type""" + return { + 'H': "Half", + 'S': "Single", + 'D': "Double", + 'C': "ComplexSingle", + 'Z': "ComplexDouble", + }[x] + + +def option_to_clblast(x): + """Translates an option name to a CLBlast data-type""" + return { + 'layout': "Layout", + 'a_transpose': "Transpose", + 'b_transpose': "Transpose", + 'ab_transpose': "Transpose", + 'side': "Side", + 'triangle': "Triangle", + 'diagonal': "Diagonal", + }[x] + + +def option_to_clblas(x): + """As above, but for clBLAS data-types""" + return { + 'layout': "clblasOrder", + 'a_transpose': "clblasTranspose", + 'b_transpose': "clblasTranspose", + 'ab_transpose': "clblasTranspose", + 'side': "clblasSide", + 'triangle': "clblasUplo", + 'diagonal': "clblasDiag", + }[x] + + +def option_to_cblas(x): + """As above, but for CBLAS data-types""" + return { + 'layout': "CBLAS_ORDER", + 'a_transpose': "CBLAS_TRANSPOSE", + 'b_transpose': "CBLAS_TRANSPOSE", + 'ab_transpose': "CBLAS_TRANSPOSE", + 'side': "CBLAS_SIDE", + 'triangle': "CBLAS_UPLO", + 'diagonal': "CBLAS_DIAG", + }[x] + + +def option_to_documentation(x): + """Translates an option name to a documentation string""" + return { + 'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.", + 'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", + 'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).", + 'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).", + 'diagonal': "The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.", + }[x] diff --git a/scripts/generator/generator/cpp.py b/scripts/generator/generator/cpp.py new file mode 100644 index 00000000..427eb180 --- /dev/null +++ b/scripts/generator/generator/cpp.py @@ -0,0 +1,257 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. 
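Editor's note on generator/convert.py above: the helpers are plain lookup tables. A few example lookups against the dictionaries exactly as defined (assuming the package-style import used elsewhere in this patch):

    import generator.convert as convert

    convert.precision_to_full_name("Z")       # -> "ComplexDouble"
    convert.option_to_clblast("a_transpose")  # -> "Transpose"
    convert.option_to_clblas("triangle")      # -> "clblasUplo"
    convert.option_to_cblas("layout")         # -> "CBLAS_ORDER"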
+# +# Author(s): +# Cedric Nugteren + +import generator.datatype as datatype +import generator.convert as convert + + +NL = "\n" +SEPARATOR = "// =================================================================================================" + +# Separators for the BLAS levels +LEVEL_SEPARATORS = [ + NL + SEPARATOR + NL + "// BLAS level-1 (vector-vector) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// BLAS level-2 (matrix-vector) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// BLAS level-3 (matrix-matrix) routines" + NL + SEPARATOR, + NL + SEPARATOR + NL + "// Extra non-BLAS routines (level-X)" + NL + SEPARATOR +] + +# Names of the level sub-folders +LEVEL_NAMES = ["1", "2", "3", "x"] + +# Main header/footer for source files +FOOTER = NL + SEPARATOR + NL +HEADER = NL + SEPARATOR + """ +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +""" + SEPARATOR + NL + + +def clblast_h(routine): + """The C++ API header (.h)""" + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + result += routine.routine_header_cpp(12, " = nullptr") + ";" + NL + return result + + +def clblast_cc(routine): + """The C++ API implementation (.cpp)""" + indent1 = " " * (20 + routine.length()) + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + if routine.implemented: + result += routine.routine_header_cpp(12, "") + " {" + NL + result += " auto queue_cpp = Queue(*queue);" + NL + result += " auto routine = X" + routine.name + "<" + routine.template.template + ">(queue_cpp, event);" + NL + result += " auto status = routine.SetUp();" + NL + result += " if (status != StatusCode::kSuccess) { return status; }" + NL + result += " return routine.Do" + routine.name.capitalize() + "(" + result += ("," + NL + indent1).join([a for a in routine.arguments_clcudaapi()]) + result += ");" + NL + else: + result += routine.routine_header_type_cpp(12) + " {" + NL + result += " return StatusCode::kNotImplemented;" + NL + result += "}" + NL + for flavour in routine.flavours: + indent2 = " " * (34 + routine.length() + len(flavour.template)) + result += "template StatusCode PUBLIC_API " + routine.name.capitalize() + "<" + flavour.template + ">(" + result += ("," + NL + indent2).join([a for a in routine.arguments_type(flavour)]) + result += "," + NL + indent2 + "cl_command_queue*, cl_event*);" + NL + return result + + +def clblast_c_h(routine): + """The C API header (.h)""" + result = NL + "// " + routine.description + ": " + routine.short_names() + NL + for flavour in routine.flavours: + result += routine.routine_header_c(flavour, 31, " PUBLIC_API") + ";" + NL + return result + + +def clblast_c_cc(routine): + """The C API implementation (.cpp)""" + result = NL + "// " + routine.name.upper() + NL + for flavour in routine.flavours: + template = "<" + flavour.template + ">" if routine.no_scalars() else "" + indent = " " * (26 + routine.length() + len(template)) + result += routine.routine_header_c(flavour, 20, "") + " {" + NL + result += " auto status = clblast::" + routine.name.capitalize() + template + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_cast(flavour, indent)]) + result += "," + NL + indent + "queue, event);" + result += NL + " return static_cast(status);" + NL + "}" + NL + return result + + +def 
wrapper_clblas(routine): + """The wrapper to the reference clBLAS routines (for performance/correctness testing)""" + result = "" + if routine.has_tests: + result += NL + "// Forwards the clBLAS calls for %s" % routine.short_names_tested() + NL + if routine.no_scalars(): + result += routine.routine_header_wrapper_clblas(routine.template, True, 21) + ";" + NL + for flavour in routine.flavours: + result += routine.routine_header_wrapper_clblas(flavour, False, 21) + " {" + NL + + # There is a version available in clBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + indent = " " * (17 + routine.length()) + arguments = routine.arguments_wrapper_clblas(flavour) + if routine.scratch: + result += " auto queue = Queue(queues[0]);" + NL + result += " auto context = queue.GetContext();" + NL + result += " auto scratch_buffer = Buffer<" + flavour.template + ">" + result += "(context, " + routine.scratch + ");" + NL + arguments += ["scratch_buffer()"] + result += " return clblas" + flavour.name + routine.name + "(" + result += ("," + NL + indent).join([a for a in arguments]) + result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);" + + # There is no clBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " " * (24 + routine.length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer, queues[0]);" + NL + + # Call the float routine + result += " auto status = clblasX" + routine.name + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + result += "," + NL + indent + "num_queues, queues, num_wait_events, wait_events, events);" + result += NL + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis, queues[0]);" + NL + result += " return status;" + + # Complete + result += NL + "}" + NL + return result + + +def wrapper_cblas(routine): + """The wrapper to the reference CBLAS routines (for performance/correctness testing)""" + result = "" + if routine.has_tests: + result += NL + "// Forwards the Netlib BLAS calls for %s" % routine.short_names_tested() + NL + for flavour in routine.flavours: + result += routine.routine_header_wrapper_cblas(flavour, 12) + " {" + NL + + # There is a version available in CBLAS + if flavour.precision_name in ["S", "D", "C", "Z"]: + indent = " " * (10 + routine.length()) + arguments = routine.arguments_wrapper_cblas(flavour) + + # Complex scalars + for scalar in routine.scalars: + if flavour.is_complex(scalar): + result += " const auto " + scalar + "_array = std::vector<" + flavour.buffer_type[:-1] + ">" + result += "{" + scalar + ".real(), " + scalar + ".imag()};" + NL + + # Special case for scalar outputs + assignment = "" + postfix = "" + end_of_line = "" + extra_argument = "" + for output_buffer in routine.outputs: + if output_buffer in routine.scalar_buffers_first(): + if flavour in [datatype.C, datatype.Z]: + postfix += "_sub" + indent += " " + extra_argument += "," + NL + indent + extra_argument += "reinterpret_cast" + extra_argument += "(&" + output_buffer + "_buffer[" + output_buffer + "_offset])" + elif output_buffer in routine.index_buffers(): + assignment = "((int*)&" + output_buffer + "_buffer[0])[" + output_buffer + "_offset] = " + indent += " " * len(assignment) + else: + assignment = output_buffer + "_buffer[" + 
output_buffer + "_offset]" + if flavour.name in ["Sc", "Dz"]: + assignment += ".real(" + end_of_line += ")" + else: + assignment += " = " + indent += " " * len(assignment) + + result += " " + assignment + "cblas_" + flavour.name.lower() + routine.name + postfix + "(" + result += ("," + NL + indent).join([a for a in arguments]) + result += extra_argument + end_of_line + ");" + NL + + # There is no CBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " " * (9 + routine.length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto " + buf + "_buffer_bis = HalfToFloatBuffer(" + buf + "_buffer);" + NL + + # Call the float routine + result += " cblasX" + routine.name + "(" + result += ("," + NL + indent).join([a for a in routine.arguments_half()]) + result += ");" + NL + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer(" + buf + "_buffer, " + buf + "_buffer_bis);" + NL + + # Complete + result += "}" + NL + return result + + +def performance_test(routine, level_string): + """Generates the body of a performance test for a specific routine""" + result = "" + result += "#include \"test/performance/client.hpp\"" + NL + result += "#include \"test/routines/level" + level_string + "/x" + routine.name + ".hpp\"" + NL + NL + result += "// Shortcuts to the clblast namespace" + NL + result += "using float2 = clblast::float2;" + NL + result += "using double2 = clblast::double2;" + NL + NL + result += "// Main function (not within the clblast namespace)" + NL + result += "int main(int argc, char *argv[]) {" + NL + default = convert.precision_to_full_name(routine.flavours[0].precision_name) + result += " switch(clblast::GetPrecision(argc, argv, clblast::Precision::k" + default + ")) {" + NL + for precision in ["H", "S", "D", "C", "Z"]: + result += " case clblast::Precision::k" + convert.precision_to_full_name(precision) + ":" + found = False + for flavour in routine.flavours: + if flavour.precision_name == precision: + result += NL + " clblast::RunClient + + +# Short-hands for data-types +D_HALF = "half" +D_FLOAT = "float" +D_DOUBLE = "double" +D_FLOAT2 = "float2" +D_DOUBLE2 = "double2" +D_HALF_OPENCL = "cl_half" +D_FLOAT2_OPENCL = "cl_float2" +D_DOUBLE2_OPENCL = "cl_double2" + + +class DataType: + """Class holding data-type and precision information""" + + def __init__(self, precision_name, name, template, scalars, buffer_type): + self.precision_name = precision_name + self.name = name + self.template = template + self.alpha_cpp = scalars[0] + self.beta_cpp = scalars[1] + self.alpha_cl = scalars[2] + self.beta_cl = scalars[3] + self.buffer_type = buffer_type + + def use_alpha(self): + """Outputs the name of the data-type (alpha/beta), possibly transforming into the right type""" + if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.alpha_cpp + "{alpha.s[0], alpha.s[1]}" + return "alpha" + + def use_beta(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.beta_cpp + "{beta.s[0], beta.s[1]}" + return "beta" + + def use_alpha_opencl(self): + """As above, but the transformation is in the opposite direction""" + if self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]: + return self.alpha_cl + "{{alpha.real(), alpha.imag()}}" + return "alpha" + + def use_beta_opencl(self): + """As above, but for beta instead of alpha""" + if self.beta_cpp in [D_FLOAT2, D_DOUBLE2]: + return 
self.beta_cl + "{{beta.real(), beta.imag()}}" + return "beta" + + def test_template(self): + """Returns the template as used in the correctness/performance tests""" + if self.buffer_type != self.beta_cpp: + return "<" + self.buffer_type + "," + self.beta_cpp + ">, " + self.buffer_type + ", " + self.beta_cpp + return "<" + self.buffer_type + ">, " + self.buffer_type + ", " + self.beta_cpp + + def is_complex(self, scalar): + """Current scalar is complex""" + return ((scalar == "alpha" and self.alpha_cpp in [D_FLOAT2, D_DOUBLE2]) or + (scalar == "beta" and self.beta_cpp in [D_FLOAT2, D_DOUBLE2])) + + +# Regular data-types +H = DataType("H", "H", D_HALF, [D_HALF] * 2 + [D_HALF_OPENCL] * 2, D_HALF) # half (16) +S = DataType("S", "S", D_FLOAT, [D_FLOAT] * 4, D_FLOAT) # single (32) +D = DataType("D", "D", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE) # double (64) +C = DataType("C", "C", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2) # single-complex (3232) +Z = DataType("Z", "Z", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2) # double-complex (6464) + +# Special cases +Sc = DataType("C", "Sc", D_FLOAT2, [D_FLOAT2] * 4, D_FLOAT2) # As C, but with real output +Dz = DataType("Z", "Dz", D_DOUBLE2, [D_DOUBLE2] * 4, D_DOUBLE2) # As Z, but with real output +iH = DataType("H", "iH", D_HALF, [D_HALF] * 4, D_HALF) # As H, but with integer output +iS = DataType("S", "iS", D_FLOAT, [D_FLOAT] * 4, D_FLOAT) # As S, but with integer output +iD = DataType("D", "iD", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE) # As D, but with integer output +iC = DataType("C", "iC", D_FLOAT2, [D_FLOAT2] * 2 + [D_FLOAT2_OPENCL] * 2, D_FLOAT2) # As C, but with integer output +iZ = DataType("Z", "iZ", D_DOUBLE2, [D_DOUBLE2] * 2 + [D_DOUBLE2_OPENCL] * 2, D_DOUBLE2) # As Z, but with int output +Css = DataType("C", "C", D_FLOAT, [D_FLOAT, D_FLOAT, D_FLOAT, D_FLOAT], D_FLOAT2) # As C, but with constants from S +Zdd = DataType("Z", "Z", D_DOUBLE, [D_DOUBLE] * 4, D_DOUBLE2) # As Z, but with constants from D +Ccs = DataType("C", "C", D_FLOAT2 + "," + D_FLOAT, [D_FLOAT2, D_FLOAT, D_FLOAT2_OPENCL, D_FLOAT], D_FLOAT2) # As C, but with one constant from S +Zzd = DataType("Z", "Z", D_DOUBLE2 + "," + D_DOUBLE, [D_DOUBLE2, D_DOUBLE, D_DOUBLE2_OPENCL, D_DOUBLE], D_DOUBLE2) # As Z, but with one constant from D + +# C++ template data-types +T = DataType("T", "typename T", "T", ["T", "T", "T", "T"], "T") # regular routine +Tc = DataType("Tc", "typename T", "std::complex,T", ["T", "T", "T", "T"], "std::complex") # for herk +TU = DataType("TU", "typename T, typename U", "T,U", ["T", "U", "T", "U"], "T") # for her2k diff --git a/scripts/generator/generator/doc.py b/scripts/generator/generator/doc.py new file mode 100644 index 00000000..8657ed0d --- /dev/null +++ b/scripts/generator/generator/doc.py @@ -0,0 +1,57 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. 
+# +# Author(s): +# Cedric Nugteren + +NL = "\n" + + +def header(): + """Generates the header for the API documentation""" + result = "CLBlast: API reference" + NL + result += "================" + NL + NL + NL + return result + + +def generate(routine): + """Generates the API documentation for a given routine""" + result = "" + + # Routine header + result += "x" + routine.name.upper() + ": " + routine.description + NL + result += "-------------" + NL + NL + result += routine.details + NL + NL + + # Routine API + result += "C++ API:" + NL + result += "```" + NL + result += routine.routine_header_cpp(12, "") + NL + result += "```" + NL + NL + result += "C API:" + NL + result += "```" + NL + for flavour in routine.flavours: + result += routine.routine_header_c(flavour, 20, "") + NL + result += "```" + NL + NL + + # Routine arguments + result += "Arguments to " + routine.name.upper() + ":" + NL + NL + for argument in routine.arguments_doc(): + result += "* " + argument + NL + result += "* `cl_command_queue* queue`: " + result += "Pointer to an OpenCL command queue associated with a context and device to execute the routine on." + NL + result += "* `cl_event* event`: " + result += "Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). " + result += "This is an optional argument." + NL + NL + + # Routine requirements + if len(routine.requirements_doc()) > 0: + result += "Requirements for " + routine.name.upper() + ":" + NL + NL + for requirement in routine.requirements_doc(): + result += "* " + requirement + NL + result += NL + + # Routine footer + result += NL + NL + return result diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py new file mode 100644 index 00000000..a4e682c2 --- /dev/null +++ b/scripts/generator/generator/routine.py @@ -0,0 +1,552 @@ + +# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This file follows the +# PEP8 Python style guide and uses a max-width of 120 characters per line. +# +# Author(s): +# Cedric Nugteren + +from itertools import chain + +import generator.convert as convert + + +class Routine: + """Class holding routine-specific information (e.g. name, which arguments, which precisions)""" + def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, + inputs, outputs, scalars, scratch, description, details, requirements): + self.implemented = implemented + self.has_tests = has_tests + self.level = level + self.name = name + self.template = template + self.flavours = flavours + self.sizes = sizes + self.options = options + self.inputs = inputs + self.outputs = outputs + self.scalars = scalars + self.scratch = scratch # Scratch buffer (e.g. 
for xDOT) + self.description = description + self.details = details + self.requirements = requirements + + @staticmethod + def scalar_buffers_first(): + """List of scalar buffers""" + return ["dot", "nrm2", "asum", "sum", "imax", "imin"] + + @staticmethod + def scalar_buffers_second(): + """List of scalar buffers""" + return ["sa", "sb", "sc", "ss", "sd1", "sd2", "sx1", "sy1", "sparam"] + + @staticmethod + def other_scalars(): + """List of scalars other than alpha and beta""" + return ["cos", "sin"] + + @staticmethod + def index_buffers(): + """List of buffers with unsigned int type""" + return ["imax", "imin"] + + @staticmethod + def postfix(name): + """Retrieves the postfix for a buffer""" + return "inc" if (name in ["x", "y"]) else "ld" + + @staticmethod + def buffers_vector(): + """Distinguish between vectors and matrices""" + return ["x", "y"] + + @staticmethod + def buffers_matrix(): + """Distinguish between vectors and matrices""" + return ["a", "b", "c", "ap"] + + def non_index_inputs(self): + """Lists of input/output buffers not index (integer)""" + buffers = self.inputs[:] # make a copy + for i in self.index_buffers(): + if i in buffers: + buffers.remove(i) + return buffers + + def non_index_outputs(self): + """Lists of input/output buffers not index (integer)""" + buffers = self.outputs[:] # make a copy + for i in self.index_buffers(): + if i in buffers: + buffers.remove(i) + return buffers + + def buffers_without_ld_inc(self): + """List of buffers without 'inc' or 'ld'""" + return self.scalar_buffers_first() + self.scalar_buffers_second() + ["ap"] + + def length(self): + """Retrieves the number of characters in the routine's name""" + return len(self.name) + + def no_scalars(self): + """Determines whether or not this routine has scalar arguments (alpha/beta)""" + return self.scalars == [] + + def short_names(self): + """Returns the upper-case names of these routines (all flavours)""" + return "/".join([f.name + self.name.upper() for f in self.flavours]) + + def short_names_tested(self): + """As above, but excludes some""" + names = [f.name + self.name.upper() for f in self.flavours] + if "H" + self.name.upper() in names: + names.remove("H" + self.name.upper()) + return "/".join(names) + + def buffers_first(self): + """Determines which buffers go first (between alpha and beta) and which ones go after""" + if self.level == "2b": + return ["x", "y"] + return ["ap", "a", "b", "x"] + + def buffers_second(self): + if self.level == "2b": + return ["ap", "a", "b", "c"] + return ["y", "c"] + + def buffer(self, name): + """Retrieves a variable name for a specific input/output vector/matrix (e.g. 
'x')""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_bis(self, name): + """As above but with a '_bis' suffix for the buffer name""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer_bis"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def(self, name): + """As above but with data-types""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "cl_mem " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def_wrapper_cl(self, name, flavour): + """As above but with data-types""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "Buffer<" + flavour.buffer_type + ">& " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_def_vector(self, name, flavour): + """As above but as vectors""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + a = [prefix + "std::vector<" + flavour.buffer_type + ">& " + name + "_buffer"] + b = ["const size_t " + name + "_offset"] + c = ["const size_t " + name + "_" + self.postfix(name)] if name not in self.buffers_without_ld_inc() else [] + return [", ".join(a + b + c)] + return [] + + def buffer_clcudaapi(self, name): + """As above but with CLCudaAPI buffers""" + if name in self.inputs or name in self.outputs: + buffer_type = "unsigned int" if (name in self.index_buffers()) else self.template.buffer_type + a = ["Buffer<" + buffer_type + ">(" + name + "_buffer)"] + b = [name + "_offset"] + c = [name + "_" + self.postfix(name)] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_wrapper_clblas(self, name): + """As above but with a static cast for clBLAS wrapper""" + if name in self.inputs or name in self.outputs: + a = [name + "_buffer()"] + b = [name + "_offset"] + c = [] + if name in ["x", "y"]: + c = ["static_cast(" + name + "_" + self.postfix(name) + ")"] + elif name in ["a", "b", "c"]: + c = [name + "_" + self.postfix(name)] + return [", ".join(a + b + c)] + return [] + + def buffer_wrapper_cblas(self, name, flavour): + """As above but with a static cast for CBLAS wrapper""" + prefix = "const " if name in self.inputs else "" + if name in self.inputs or name in self.outputs: + if name == "sy1": + a = [name + "_buffer[" + name + "_offset]"] + elif flavour.precision_name in ["C", "Z"]: + a = ["reinterpret_cast<" + prefix + flavour.buffer_type[:-1] + "*>" + + "(&" + name + "_buffer[" + name + "_offset])"] + else: + a = ["&" + name + "_buffer[" + name + "_offset]"] + c = [] + if name in ["x", "y"]: + c = ["static_cast(" + name + "_" + self.postfix(name) + ")"] + elif name in ["a", "b", "c"]: + c = [name + "_" + self.postfix(name)] + return [", ".join(a + c)] + return [] + + def buffer_type(self, name): + """As above, but 
only data-types""" + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix + "cl_mem"] + b = ["const size_t"] + c = ["const size_t"] if (name not in self.buffers_without_ld_inc()) else [] + return [", ".join(a + b + c)] + return [] + + def buffer_doc(self, name): + """Retrieves the documentation of the buffers""" + prefix = "const " if (name in self.inputs) else "" + inout = "input" if (name in self.inputs) else "output" + if (name in self.inputs) or (name in self.outputs): + math_name = name.upper() + " matrix" if (name in self.buffers_matrix()) else name + " vector" + inc_ld_description = "Leading dimension " if (name in self.buffers_matrix()) else "Stride/increment " + a = ["`" + prefix + "cl_mem " + name + "_buffer`: OpenCL buffer to store the " + inout + " " + math_name + "."] + b = ["`const size_t " + name + "_offset`: The offset in elements from the start of the " + inout + " " + math_name + "."] + if name not in self.buffers_without_ld_inc(): + c = ["`const size_t " + name + "_" + self.postfix(name) + "`: " + + inc_ld_description + "of the " + inout + " " + math_name + ". This value must be greater than 0."] + else: + c = [] + return a + b + c + return [] + + def scalar(self, name): + """Retrieves the name of a scalar (alpha/beta)""" + if name in self.scalars: + return [name] + return [] + + def scalar_half_to_float(self, name): + """As above, but converts from float to half""" + if name in self.scalars: + return ["HalfToFloat(" + name + ")"] + return [] + + def scalar_use(self, name, flavour): + """Retrieves the use of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return [flavour.use_alpha()] + elif name == "beta": + return [flavour.use_beta()] + return [name] + return [] + + def scalar_use_wrapper(self, name, flavour): + """As above, but for the clBLAS wrapper""" + if name in self.scalars: + if name == "alpha": + return [flavour.use_alpha_opencl()] + elif name == "beta": + return [flavour.use_beta_opencl()] + return [name] + return [] + + def scalar_use_wrapper_cblas(self, name, flavour): + """As above, but for the CBLAS wrapper""" + if name in self.scalars: + if flavour.is_complex(name): + return [name + "_array.data()"] + return [name] + return [] + + def scalar_def(self, name, flavour): + """Retrieves the definition of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cl + " " + name] + return ["const " + flavour.beta_cl + " " + name] + return [] + + def scalar_def_plain(self, name, flavour): + """As above, but without 'cl_' prefix""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cpp + " " + name] + return ["const " + flavour.beta_cpp + " " + name] + return [] + + def scalar_type(self, name, flavour): + """Retrieves the type of a scalar (alpha/beta)""" + if name in self.scalars: + if name == "alpha": + return ["const " + flavour.alpha_cpp] + return ["const " + flavour.beta_cpp] + return [] + + def scalar_doc(self, name): + """Retrieves the documentation of a scalar""" + if name in self.scalars: + if name == "alpha": + return ["`const " + self.template.alpha_cpp + " " + name + "`: Input scalar constant."] + return ["`const " + self.template.beta_cpp + " " + name + "`: Input scalar constant."] + return [] + + def sizes_list(self): + """Retrieves a list of comma-separated sizes (m, n, k)""" + if self.sizes: + return [", ".join([s for s in self.sizes])] + return [] + + def sizes_def(self): + 
"""Retrieves the definition of the sizes (m,n,k)""" + if self.sizes: + return [", ".join(["const size_t " + s for s in self.sizes])] + return [] + + def sizes_type(self): + """Retrieves the types of the sizes (m,n,k)""" + if self.sizes: + return [", ".join(["const size_t" for s in self.sizes])] + return [] + + def sizes_doc(self): + """# Retrieves the documentation of the sizes""" + if self.sizes: + definitions = ["`const size_t " + s + "`: Integer size argument. This value must be positive." for s in self.sizes] + return definitions + return [] + + def options_list(self): + """Retrieves a list of options""" + if self.options: + return [", ".join(self.options)] + return [] + + def options_cast(self, indent): + """As above, but now casted to CLBlast data-types""" + if self.options: + options = ["static_cast(" + o + ")" for o in self.options] + return [(",\n" + indent).join(options)] + return [] + + def options_def(self): + """Retrieves the definitions of the options (layout, transpose, side, etc.)""" + if self.options: + definitions = ["const " + convert.option_to_clblast(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + + def options_def_wrapper_clblas(self): + """As above, but now using clBLAS data-types""" + if self.options: + definitions = ["const " + convert.option_to_clblas(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + + def options_def_wrapper_cblas(self): + """As above, but now using CBLAS data-types""" + if self.options: + definitions = ["const " + convert.option_to_cblas(o) + " " + o for o in self.options] + return [", ".join(definitions)] + return [] + + def options_type(self): + """Retrieves the types of the options (layout, transpose, side, etc.)""" + if self.options: + definitions = ["const " + convert.option_to_clblast(o) for o in self.options] + return [", ".join(definitions)] + return [] + + def options_doc(self): + """Retrieves the documentation of the options""" + if self.options: + definitions = ["`const " + convert.option_to_clblast(o) + " " + o + "`: " + convert.option_to_documentation(o) for o in self.options] + return definitions + return [] + + def arguments(self): + """Retrieves a combination of all the argument names (no types)""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) + + self.scalar("alpha") + + list(chain(*[self.buffer(b) for b in self.buffers_first()])) + + self.scalar("beta") + + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_half(self): + """As above, but with conversions from half to float""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_first()])) + + self.scalar_half_to_float("alpha") + + list(chain(*[self.buffer_bis(b) for b in self.buffers_first()])) + + self.scalar_half_to_float("beta") + + list(chain(*[self.buffer_bis(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_bis(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_clcudaapi(self): + """Retrieves a combination of all the argument names, with CLCudaAPI casts""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_first()])) + + self.scalar("alpha") + 
+ list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_first()])) + + self.scalar("beta") + + list(chain(*[self.buffer_clcudaapi(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_clcudaapi(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar(s) for s in self.other_scalars()]))) + + def arguments_cast(self, flavour, indent): + """As above, but with CLBlast casts""" + return (self.options_cast(indent) + self.sizes_list() + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_first()])) + + self.scalar_use("alpha", flavour) + + list(chain(*[self.buffer(b) for b in self.buffers_first()])) + + self.scalar_use("beta", flavour) + + list(chain(*[self.buffer(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use(s, flavour) for s in self.other_scalars()]))) + + def arguments_wrapper_clblas(self, flavour): + """As above, but for the clBLAS wrapper""" + return (self.options_list() + self.sizes_list() + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_first()])) + + self.scalar_use_wrapper("alpha", flavour) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_first()])) + + self.scalar_use_wrapper("beta", flavour) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_wrapper_clblas(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use_wrapper(s, flavour) for s in self.other_scalars()]))) + + def arguments_wrapper_cblas(self, flavour): + """As above, but for the CBLAS wrapper""" + return (self.options_list() + self.sizes_list() + + self.scalar_use_wrapper_cblas("alpha", flavour) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_first()])) + + self.scalar_use_wrapper_cblas("beta", flavour) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_wrapper_cblas(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_use_wrapper_cblas(s, flavour) for s in self.other_scalars()]))) + + def arguments_def(self, flavour): + """Retrieves a combination of all the argument definitions""" + return (self.options_def() + self.sizes_def() + + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_first()])) + + self.scalar_def("alpha", flavour) + + list(chain(*[self.buffer_def(b) for b in self.buffers_first()])) + + self.scalar_def("beta", flavour) + + list(chain(*[self.buffer_def(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def(s, flavour) for s in self.other_scalars()]))) + + def arguments_def_wrapper_clblas(self, flavour): + """As above, but clBLAS wrapper plain data-types""" + return (self.options_def_wrapper_clblas() + self.sizes_def() + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_plain("alpha", flavour) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_plain("beta", flavour) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_wrapper_cl(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) + + def arguments_def_wrapper_cblas(self, flavour): + """As above, but CBLAS 
wrapper plain data-types""" + return (self.options_def_wrapper_cblas() + self.sizes_def() + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_first()])) + + self.scalar_def_plain("alpha", flavour) + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_first()])) + + self.scalar_def_plain("beta", flavour) + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.buffers_second()])) + + list(chain(*[self.buffer_def_vector(b, flavour) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_def_plain(s, flavour) for s in self.other_scalars()]))) + + def arguments_type(self, flavour): + """Retrieves a combination of all the argument types""" + return (self.options_type() + self.sizes_type() + + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_first()])) + + self.scalar_type("alpha", flavour) + + list(chain(*[self.buffer_type(b) for b in self.buffers_first()])) + + self.scalar_type("beta", flavour) + + list(chain(*[self.buffer_type(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_type(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_type(s, flavour) for s in self.other_scalars()]))) + + def arguments_doc(self): + """Retrieves a combination of all the argument types""" + return (self.options_doc() + self.sizes_doc() + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_first()])) + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_first()])) + + self.scalar_doc("alpha") + + list(chain(*[self.buffer_doc(b) for b in self.buffers_first()])) + + self.scalar_doc("beta") + + list(chain(*[self.buffer_doc(b) for b in self.buffers_second()])) + + list(chain(*[self.buffer_doc(b) for b in self.scalar_buffers_second()])) + + list(chain(*[self.scalar_doc(s) for s in self.other_scalars()]))) + + def requirements_doc(self): + """Retrieves a list of routine requirements for documentation""" + return self.requirements + + def routine_header_cpp(self, spaces, default_event): + """Retrieves the C++ templated definition for a routine""" + indent = " " * (spaces + self.length()) + result = "template <" + self.template.name + ">\n" + result += "StatusCode " + self.name.capitalize() + "(" + result += (",\n" + indent).join([a for a in self.arguments_def(self.template)]) + result += ",\n" + indent + "cl_command_queue* queue, cl_event* event" + default_event + ")" + return result + + def routine_header_type_cpp(self, spaces): + """As above, but now without variable names""" + indent = " " * (spaces + self.length()) + result = "template <" + self.template.name + ">\n" + result += "StatusCode " + self.name.capitalize() + "(" + result += (",\n" + indent).join([a for a in self.arguments_type(self.template)]) + result += ",\n" + indent + "cl_command_queue*, cl_event*)" + return result + + def routine_header_c(self, flavour, spaces, extra_qualifier): + """As above, but now for C""" + indent = " " * (spaces + self.length()) + result = "StatusCode" + extra_qualifier + " CLBlast" + flavour.name + self.name + "(" + result += (",\n" + indent).join([a for a in self.arguments_def(flavour)]) + result += ",\n" + indent + "cl_command_queue* queue, cl_event* event)" + return result + + def routine_header_wrapper_clblas(self, flavour, def_only, spaces): + """As above, but now for the clBLAS wrapper""" + template = "<" + flavour.template + ">" if self.no_scalars() and not def_only else "" + indent = " " * (spaces + self.length() + len(template)) + result = "" + if self.no_scalars(): + result += 
"template <" + if def_only: + result += flavour.name + result += ">\n" + result += "clblasStatus clblasX" + self.name + template + "(" + result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_clblas(flavour)]) + result += ",\n" + indent + "cl_uint num_queues, cl_command_queue *queues" + result += ",\n" + indent + "cl_uint num_wait_events, const cl_event *wait_events, cl_event *events)" + return result + + def routine_header_wrapper_cblas(self, flavour, spaces): + """As above, but now for the CBLAS wrapper""" + indent = " " * (spaces + self.length()) + result = "void cblasX" + self.name + "(" + result += (",\n" + indent).join([a for a in self.arguments_def_wrapper_cblas(flavour)]) + ")" + return result diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py deleted file mode 100644 index 00883776..00000000 --- a/scripts/generator/routine.py +++ /dev/null @@ -1,603 +0,0 @@ -#!/usr/bin/env python - -# ================================================================================================== -# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -# project loosely follows the Google C++ styleguide and uses a max-width of 100 characters per line. -# -# Author(s): -# Cedric Nugteren -# -# This file contains the 'Routine' class, used in the generator script to generate the CLBlast API -# interface and implementation. -# -# ================================================================================================== - -# System modules -from itertools import chain - -# Translates an option name to a CLBlast data-type -def OptionToCLBlast(x): - return { - 'layout': "Layout", - 'a_transpose': "Transpose", - 'b_transpose': "Transpose", - 'ab_transpose': "Transpose", - 'side': "Side", - 'triangle': "Triangle", - 'diagonal': "Diagonal", - }[x] - -# As above, but for clBLAS data-types -def OptionToWrapperCL(x): - return { - 'layout': "clblasOrder", - 'a_transpose': "clblasTranspose", - 'b_transpose': "clblasTranspose", - 'ab_transpose': "clblasTranspose", - 'side': "clblasSide", - 'triangle': "clblasUplo", - 'diagonal': "clblasDiag", - }[x] - -# As above, but for CBLAS data-types -def OptionToWrapperC(x): - return { - 'layout': "CBLAS_ORDER", - 'a_transpose': "CBLAS_TRANSPOSE", - 'b_transpose': "CBLAS_TRANSPOSE", - 'ab_transpose': "CBLAS_TRANSPOSE", - 'side': "CBLAS_SIDE", - 'triangle': "CBLAS_UPLO", - 'diagonal': "CBLAS_DIAG", - }[x] - -# Translates an option name to a documentation string -def OptionToDoc(x): - return { - 'layout': "Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.", - 'a_transpose': "Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'b_transpose': "Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'ab_transpose': "Transposing the packed input matrix AP, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.", - 'side': "The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).", - 'triangle': "The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).", - 'diagonal': "The property of the diagonal matrix, 
either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.", - }[x] - -# ================================================================================================== - -# Class holding routine-specific information (e.g. name, which arguments, which precisions) -class Routine(): - def __init__(self, implemented, has_tests, level, name, template, flavours, sizes, options, - inputs, outputs, scalars, scratch, description, details, requirements): - self.implemented = implemented - self.has_tests = has_tests - self.level = level - self.name = name - self.template = template - self.flavours = flavours - self.sizes = sizes - self.options = options - self.inputs = inputs - self.outputs = outputs - self.scalars = scalars - self.scratch = scratch # Scratch buffer (e.g. for xDOT) - self.description = description - self.details = details - self.requirements = requirements - - # List of scalar buffers - def ScalarBuffersFirst(self): - return ["dot","nrm2","asum","sum","imax","imin"] - def ScalarBuffersSecond(self): - return ["sa","sb","sc","ss","sd1","sd2","sx1","sy1","sparam"] - - # List of scalars other than alpha and beta - def OtherScalars(self): - return ["cos","sin"] - - # List of buffers with unsigned int type - def IndexBuffers(self): - return ["imax","imin"] - - # Lists of input/output buffers not index (integer) - def NonIndexInputs(self): - buffers = self.inputs[:] # make a copy - for i in self.IndexBuffers(): - if i in buffers: buffers.remove(i) - return buffers - def NonIndexOutputs(self): - buffers = self.outputs[:] # make a copy - for i in self.IndexBuffers(): - if i in buffers: buffers.remove(i) - return buffers - - # List of buffers without 'inc' or 'ld' - def BuffersWithoutLdInc(self): - return self.ScalarBuffersFirst() + self.ScalarBuffersSecond() + ["ap"] - - # Retrieves the number of characters in the routine's name - def Length(self): - return len(self.name) - - # Retrieves the postfix for a buffer - def Postfix(self, name): - return "inc" if (name in ["x","y"]) else "ld" - - # Determines whether or not this routine has scalar arguments (alpha/beta) - def NoScalars(self): - return self.scalars == [] - - # Returns the upper-case names of these routines (all flavours) - def ShortNames(self): - return "/".join([f.name+self.name.upper() for f in self.flavours]) - - # As above, but excludes some - def ShortNamesTested(self): - names = [f.name+self.name.upper() for f in self.flavours] - if "H"+self.name.upper() in names: names.remove("H"+self.name.upper()) - return "/".join(names) - - # Determines which buffers go first (between alpha and beta) and which ones go after - def BuffersFirst(self): - if self.level == "2b": - return ["x","y"] - return ["ap","a","b","x"] - def BuffersSecond(self): - if self.level == "2b": - return ["ap","a","b","c"] - return ["y","c"] - - # Distinguish between vectors and matrices - def BuffersVector(self): - return ["x","y"] - def BuffersMatrix(self): - return ["a","b","c","ap"] - - # ============================================================================================== - - # Retrieves a variable name for a specific input/output vector/matrix (e.g. 
'x') - def Buffer(self, name): - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with a '_bis' suffix for the buffer name - def BufferBis(self, name): - #if (name in self.IndexBuffers()): - # return self.Buffer(name) - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer_bis"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with data-types - def BufferDef(self, name): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"cl_mem "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with data-types - def BufferDefWrapperCL(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"Buffer<"+flavour.buffertype+">& "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but as vectors - def BufferDefVector(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - a = [prefix+"std::vector<"+flavour.buffertype+">& "+name+"_buffer"] - b = ["const size_t "+name+"_offset"] - c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with Claduc buffers - def BufferCladuc(self, name): - if (name in self.inputs) or (name in self.outputs): - buffertype = "unsigned int" if (name in self.IndexBuffers()) else self.template.buffertype - a = ["Buffer<"+buffertype+">("+name+"_buffer)"] - b = [name+"_offset"] - c = [name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # As above but with a static cast for clBLAS wrapper - def BufferWrapperCL(self, name): - if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer()"] - b = [name+"_offset"] - c = [] - if (name in ["x","y"]): - c = ["static_cast("+name+"_"+self.Postfix(name)+")"] - elif (name in ["a","b","c"]): - c = [name+"_"+self.Postfix(name)] - return [", ".join(a+b+c)] - return [] - - # As above but with a static cast for CBLAS wrapper - def BufferWrapperC(self, name, flavour): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - if name == "sy1": - a = [name+"_buffer["+name+"_offset]"] - elif flavour.precision_name in ["C","Z"]: - a = ["reinterpret_cast<"+prefix+flavour.buffertype[:-1]+"*>(&"+name+"_buffer["+name+"_offset])"] - else: - a = ["&"+name+"_buffer["+name+"_offset]"] - c = [] - if (name in ["x","y"]): - c = ["static_cast("+name+"_"+self.Postfix(name)+")"] - elif (name in ["a","b","c"]): - c = [name+"_"+self.Postfix(name)] - return [", ".join(a+c)] - return [] - - # As above, but only data-types - def BufferType(self, name): - prefix = "const " if (name in self.inputs) else "" - if (name in self.inputs) or (name in self.outputs): - 
a = [prefix+"cl_mem"] - b = ["const size_t"] - c = ["const size_t"] if (name not in self.BuffersWithoutLdInc()) else [] - return [", ".join(a+b+c)] - return [] - - # Retrieves the documentation of the buffers - def BufferDoc(self, name): - prefix = "const " if (name in self.inputs) else "" - inout = "input" if (name in self.inputs) else "output" - if (name in self.inputs) or (name in self.outputs): - math_name = name.upper()+" matrix" if (name in self.BuffersMatrix()) else name+" vector" - incld_description = "Leading dimension " if (name in self.BuffersMatrix()) else "Stride/increment " - a = ["`"+prefix+"cl_mem "+name+"_buffer`: OpenCL buffer to store the "+inout+" "+math_name+"."] - b = ["`const size_t "+name+"_offset`: The offset in elements from the start of the "+inout+" "+math_name+"."] - c = ["`const size_t "+name+"_"+self.Postfix(name)+"`: "+incld_description+"of the "+inout+" "+math_name+". This value must be greater than 0."] if (name not in self.BuffersWithoutLdInc()) else [] - return a+b+c - return [] - - # ============================================================================================== - - # Retrieves the name of a scalar (alpha/beta) - def Scalar(self, name): - if (name in self.scalars): - return [name] - return [] - - # As above, but converts from float to half - def ScalarHalfToFloat(self, name): - if name in self.scalars: - return ["HalfToFloat("+name+")"] - return [] - - # Retrieves the use of a scalar (alpha/beta) - def ScalarUse(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return [flavour.UseAlpha()] - elif name == "beta": - return [flavour.UseBeta()] - return [name] - return [] - - # As above, but for the clBLAS wrapper - def ScalarUseWrapper(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return [flavour.UseAlphaCL()] - elif name == "beta": - return [flavour.UseBetaCL()] - return [name] - return [] - - # As above, but for the CBLAS wrapper - def ScalarUseWrapperC(self, name, flavour): - if name in self.scalars: - if flavour.IsComplex(name): - return [name+"_array.data()"] - return [name] - return [] - - # Retrieves the definition of a scalar (alpha/beta) - def ScalarDef(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cl+" "+name] - return ["const "+flavour.beta_cl+" "+name] - return [] - - # As above, but without 'cl_' prefix - def ScalarDefPlain(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cpp+" "+name] - return ["const "+flavour.beta_cpp+" "+name] - return [] - - # Retrieves the type of a scalar (alpha/beta) - def ScalarType(self, name, flavour): - if name in self.scalars: - if name == "alpha": - return ["const "+flavour.alpha_cpp] - return ["const "+flavour.beta_cpp] - return [] - - # Retrieves the documentation of a scalar - def ScalarDoc(self, name): - if name in self.scalars: - if name == "alpha": - return ["`const "+self.template.alpha_cpp+" "+name+"`: Input scalar constant."] - return ["`const "+self.template.beta_cpp+" "+name+"`: Input scalar constant."] - return [] - - # ============================================================================================== - - # Retrieves a list of comma-separated sizes (m, n, k) - def Sizes(self): - if self.sizes: - return [", ".join([s for s in self.sizes])] - return [] - - # Retrieves the definition of the sizes (m,n,k) - def SizesDef(self): - if self.sizes: - return [", ".join(["const size_t "+s for s in self.sizes])] - return [] - - 
# Retrieves the types of the sizes (m,n,k) - def SizesType(self): - if self.sizes: - return [", ".join(["const size_t" for s in self.sizes])] - return [] - - # Retrieves the documentation of the sizes - def SizesDoc(self): - if self.sizes: - definitions = ["`const size_t "+s+"`: Integer size argument. This value must be positive." for s in self.sizes] - return definitions - return [] - - # ============================================================================================== - - # Retrieves a list of options - def Options(self): - if self.options: - return [", ".join(self.options)] - return [] - - # As above, but now casted to CLBlast data-types - def OptionsCast(self, indent): - if self.options: - options = ["static_cast("+o+")" for o in self.options] - return [(",\n"+indent).join(options)] - return [] - - # Retrieves the definitions of the options (layout, transpose, side, etc.) - def OptionsDef(self): - if self.options: - definitions = ["const "+OptionToCLBlast(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # As above, but now using clBLAS data-types - def OptionsDefWrapperCL(self): - if self.options: - definitions = ["const "+OptionToWrapperCL(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # As above, but now using CBLAS data-types - def OptionsDefWrapperC(self): - if self.options: - definitions = ["const "+OptionToWrapperC(o)+" "+o for o in self.options] - return [", ".join(definitions)] - return [] - - # Retrieves the types of the options (layout, transpose, side, etc.) - def OptionsType(self): - if self.options: - definitions = ["const "+OptionToCLBlast(o) for o in self.options] - return [", ".join(definitions)] - return [] - - # Retrieves the documentation of the options - def OptionsDoc(self): - if self.options: - definitions = ["`const "+OptionToCLBlast(o)+" "+o+"`: "+OptionToDoc(o) for o in self.options] - return definitions - return [] - - # ============================================================================================== - - # Retrieves a combination of all the argument names (no types) - def Arguments(self): - return (self.Options() + self.Sizes() + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + - self.Scalar("alpha") + - list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + - self.Scalar("beta") + - list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # As above, but with conversions from half to float - def ArgumentsHalf(self): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarHalfToFloat("alpha") + - list(chain(*[self.BufferBis(b) for b in self.BuffersFirst()])) + - self.ScalarHalfToFloat("beta") + - list(chain(*[self.BufferBis(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferBis(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument names, with Claduc casts - def ArgumentsCladuc(self, flavour, indent): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersFirst()])) + - self.Scalar("alpha") + - list(chain(*[self.BufferCladuc(b) for b in self.BuffersFirst()])) + - self.Scalar("beta") + - list(chain(*[self.BufferCladuc(b) for b in self.BuffersSecond()])) + - 
list(chain(*[self.BufferCladuc(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.Scalar(s) for s in self.OtherScalars()]))) - - # As above, but with CLBlast casts - def ArgumentsCast(self, flavour, indent): - return (self.OptionsCast(indent) + self.Sizes() + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarUse("alpha", flavour) + - list(chain(*[self.Buffer(b) for b in self.BuffersFirst()])) + - self.ScalarUse("beta", flavour) + - list(chain(*[self.Buffer(b) for b in self.BuffersSecond()])) + - list(chain(*[self.Buffer(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUse(s, flavour) for s in self.OtherScalars()]))) - - # As above, but for the clBLAS wrapper - def ArgumentsWrapperCL(self, flavour): - return (self.Options() + self.Sizes() + - list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarUseWrapper("alpha", flavour) + - list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersFirst()])) + - self.ScalarUseWrapper("beta", flavour) + - list(chain(*[self.BufferWrapperCL(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapperCL(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUseWrapper(s, flavour) for s in self.OtherScalars()]))) - - # As above, but for the CBLAS wrapper - def ArgumentsWrapperC(self, flavour): - return (self.Options() + self.Sizes() + - self.ScalarUseWrapperC("alpha", flavour) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarUseWrapperC("beta", flavour) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferWrapperC(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarUseWrapperC(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument definitions - def ArgumentsDef(self, flavour): - return (self.OptionsDef() + self.SizesDef() + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarDef("alpha", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + - self.ScalarDef("beta", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDef(s, flavour) for s in self.OtherScalars()]))) - - # As above, but clBLAS wrapper plain datatypes - def ArgumentsDefWrapperCL(self, flavour): - return (self.OptionsDefWrapperCL() + self.SizesDef() + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersFirst()])) + - self.ScalarDefPlain("alpha", flavour) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarDefPlain("beta", flavour) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) - - # As above, but CBLAS wrapper plain datatypes - def ArgumentsDefWrapperC(self, flavour): - return (self.OptionsDefWrapperC() + self.SizesDef() + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersFirst()])) + - self.ScalarDefPlain("alpha", flavour) + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.BuffersFirst()])) + - self.ScalarDefPlain("beta", flavour) + - list(chain(*[self.BufferDefVector(b, flavour) for b in 
self.BuffersSecond()])) + - list(chain(*[self.BufferDefVector(b, flavour) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument types - def ArgumentsType(self, flavour): - return (self.OptionsType() + self.SizesType() + - list(chain(*[self.BufferType(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarType("alpha", flavour) + - list(chain(*[self.BufferType(b) for b in self.BuffersFirst()])) + - self.ScalarType("beta", flavour) + - list(chain(*[self.BufferType(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferType(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarType(s, flavour) for s in self.OtherScalars()]))) - - # Retrieves a combination of all the argument types - def ArgumentsDoc(self): - return (self.OptionsDoc() + self.SizesDoc() + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersFirst()])) + - self.ScalarDoc("alpha") + - list(chain(*[self.BufferDoc(b) for b in self.BuffersFirst()])) + - self.ScalarDoc("beta") + - list(chain(*[self.BufferDoc(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDoc(b) for b in self.ScalarBuffersSecond()])) + - list(chain(*[self.ScalarDoc(s) for s in self.OtherScalars()]))) - - # ============================================================================================== - - # Retrieves a list of routine requirements for documentation - def RequirementsDoc(self): - return self.requirements - - # ============================================================================================== - - # Retrieves the C++ templated definition for a routine - def RoutineHeaderCPP(self, spaces, default_event): - indent = " "*(spaces + self.Length()) - result = "template <"+self.template.name+">\n" - result += "StatusCode "+self.name.capitalize()+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDef(self.template)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event"+default_event+")" - return result - - # As above, but now without variable names - def RoutineHeaderTypeCPP(self, spaces): - indent = " "*(spaces + self.Length()) - result = "template <"+self.template.name+">\n" - result += "StatusCode "+self.name.capitalize()+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsType(self.template)]) - result += ",\n"+indent+"cl_command_queue*, cl_event*)" - return result - - # As above, but now for C - def RoutineHeaderC(self, flavour, spaces, extra_qualifier): - indent = " "*(spaces + self.Length()) - result = "StatusCode"+extra_qualifier+" CLBlast"+flavour.name+self.name+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDef(flavour)]) - result += ",\n"+indent+"cl_command_queue* queue, cl_event* event)" - return result - - # As above, but now for the clBLAS wrapper - def RoutineHeaderWrapperCL(self, flavour, def_only, spaces): - template = "<"+flavour.template+">" if self.NoScalars() and not def_only else "" - indent = " "*(spaces + self.Length() + len(template)) - result = "" - if self.NoScalars(): - result += "template <" - if def_only: - result += flavour.name - result += ">\n" - result += "clblasStatus clblasX"+self.name+template+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperCL(flavour)]) - result += ",\n"+indent+"cl_uint num_queues, cl_command_queue *queues" - result += ",\n"+indent+"cl_uint num_wait_events, const cl_event *wait_events, 
cl_event *events)" - return result - - # As above, but now for the CBLAS wrapper - def RoutineHeaderWrapperC(self, flavour, def_only, spaces): - indent = " "*(spaces + self.Length()) - result = "void cblasX"+self.name+"(" - result += (",\n"+indent).join([a for a in self.ArgumentsDefWrapperC(flavour)])+")" - return result - -# ================================================================================================== -- cgit v1.2.3 From 55038d3c919a6584e5e5891e2290c67698f3c90d Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 6 Sep 2016 20:30:06 +0200 Subject: Split GEMM tuning in two parts: a small set of tuning parameters which is explored exhaustively and a larger set which is explored randomly --- src/tuning/kernels/xgemm.cpp | 85 +++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp index eb7c8a66..7c9ac76a 100644 --- a/src/tuning/kernels/xgemm.cpp +++ b/src/tuning/kernels/xgemm.cpp @@ -7,7 +7,9 @@ // Author(s): // Cedric Nugteren // -// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. +// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations: +// - V==1: This tests some limited set of tuning parameters exhaustively. +// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset. // // ================================================================================================= @@ -21,12 +23,12 @@ namespace clblast { // ================================================================================================= // See comment at top of file for a description of the class -template +template class TuneXgemm { public: // The representative kernel and the source code - static std::string KernelFamily() { return "xgemm"; } + static std::string KernelFamily() { return (V==1) ? "xgemm_1" : "xgemm_2"; } static std::string KernelName() { return "Xgemm"; } static std::string GetSources() { return @@ -48,7 +50,7 @@ class TuneXgemm { static size_t DefaultM() { return 1024; } static size_t DefaultN() { return 1024; } static size_t DefaultK() { return 1024; } - static double DefaultFraction() { return 256.0; } + static double DefaultFraction() { return (V==1) ? 
1.0 : 512.0; } // test all or sample randomly // Describes how to obtain the sizes of the buffers static size_t GetSizeX(const Arguments &) { return 1; } // N/A for this kernel @@ -60,20 +62,38 @@ class TuneXgemm { // Sets the tuning parameters and their possible values static void SetParameters(cltune::Tuner &tuner, const size_t id) { - tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); - tuner.AddParameter(id, "KWG", {16, 32}); - tuner.AddParameter(id, "MDIMC", {8, 16, 32}); - tuner.AddParameter(id, "NDIMC", {8, 16, 32}); - tuner.AddParameter(id, "MDIMA", {8, 16, 32}); - tuner.AddParameter(id, "NDIMB", {8, 16, 32}); - tuner.AddParameter(id, "KWI", {2}); - tuner.AddParameter(id, "VWM", {1, 2, 4}); - tuner.AddParameter(id, "VWN", {1, 2, 4}); - tuner.AddParameter(id, "STRM", {0, 1}); - tuner.AddParameter(id, "STRN", {0, 1}); - tuner.AddParameter(id, "SA", {0, 1}); - tuner.AddParameter(id, "SB", {0, 1}); + if (V==1) { // limited subset of tuning parameters - but explorable exhaustively + tuner.AddParameter(id, "MWG", {16, 32, 64}); + tuner.AddParameter(id, "NWG", {16, 32, 64}); + tuner.AddParameter(id, "KWG", {32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4}); + tuner.AddParameter(id, "VWN", {1, 2, 4}); + tuner.AddParameter(id, "STRM", {0}); + tuner.AddParameter(id, "STRN", {0}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } // a lot more tuning parameters - has to be sampled randomly, too much to test all + else { + tuner.AddParameter(id, "MWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "NWG", {16, 32, 64, 128}); + tuner.AddParameter(id, "KWG", {16, 32}); + tuner.AddParameter(id, "MDIMC", {8, 16, 32}); + tuner.AddParameter(id, "NDIMC", {8, 16, 32}); + tuner.AddParameter(id, "MDIMA", {8, 16, 32}); + tuner.AddParameter(id, "NDIMB", {8, 16, 32}); + tuner.AddParameter(id, "KWI", {2}); + tuner.AddParameter(id, "VWM", {1, 2, 4, 8}); + tuner.AddParameter(id, "VWN", {1, 2, 4, 8}); + tuner.AddParameter(id, "STRM", {0, 1}); + tuner.AddParameter(id, "STRN", {0, 1}); + tuner.AddParameter(id, "SA", {0, 1}); + tuner.AddParameter(id, "SB", {0, 1}); + } } // Sets the constraints @@ -92,6 +112,14 @@ class TuneXgemm { // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...) 
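// As a concrete illustration of the constraint above, using values from the parameter sets listed
// earlier in this file (an added example, not part of the original source): with MDIMC=16, NDIMC=8
// and MDIMA=8, KDIMA = (MDIMC*NDIMC)/MDIMA = (16*8)/8 = 16, so KWG must be a multiple of 16. Both
// 16 and 32 from the V==2 set satisfy this, as does the single KWG=32 value offered in the V==1 set.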
tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"}); tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"}); + + // Extra constraints for variation 1 to limit the set of options significantly + if (V==1) { + auto IsEqual = [] (std::vector v) { return v[0] == v[1]; }; + tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"}); + tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"}); + tuner.AddConstraint(id, IsEqual, {"SA", "SB"}); + } } // Sets the local memory size @@ -145,15 +173,22 @@ class TuneXgemm { using float2 = clblast::float2; using double2 = clblast::double2; -// Main function (not within the clblast namespace) -int main(int argc, char *argv[]) { +// Function to tune a specific variation V (not within the clblast namespace) +template +void StartVariation(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; - case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; - case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; - case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; - case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; + case clblast::Precision::kHalf: clblast::Tuner, half>(argc, argv); break; + case clblast::Precision::kSingle: clblast::Tuner, float>(argc, argv); break; + case clblast::Precision::kDouble: clblast::Tuner, double>(argc, argv); break; + case clblast::Precision::kComplexSingle: clblast::Tuner, float2>(argc, argv); break; + case clblast::Precision::kComplexDouble: clblast::Tuner, double2>(argc, argv); break; } +} + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + StartVariation<1>(argc, argv); + StartVariation<2>(argc, argv); return 0; } -- cgit v1.2.3 From 3daba709974fa2b13b2c44be8e555f2bc6d8356a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 10 Sep 2016 11:12:09 +0200 Subject: Updated the database script to remove duplicate entries: keeps only the best-performing cases for a specific parameters combination --- scripts/database/database.py | 2 +- scripts/database/database/db.py | 30 +++++++++++++++++++++++++----- scripts/database/database/defaults.py | 10 ---------- src/database/kernels/copy.hpp | 2 +- src/database/kernels/xaxpy.hpp | 8 ++++---- src/database/kernels/xgemv.hpp | 2 +- src/database/kernels/xgemv_fast.hpp | 8 ++++---- 7 files changed, 36 insertions(+), 26 deletions(-) diff --git a/scripts/database/database.py b/scripts/database/database.py index 6d370d99..944c1bd6 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -77,12 +77,12 @@ def main(argv): # Adds the new data to the database old_size = len(database.index) database = db.concatenate_database(database, imported_data) - database = db.remove_duplicates(database) new_size = len(database.index) print("with " + str(new_size - old_size) + " new items") # Newline printed here # Stores the modified database back to disk if len(glob.glob(json_files)) >= 1: + database = db.remove_duplicates(database) io.save_database(database, database_filename) # Optional: update the database here. 
Default is disabled, code below is just an example diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py index 60cfbcfa..6534d689 100644 --- a/scripts/database/database/db.py +++ b/scripts/database/database/db.py @@ -6,6 +6,7 @@ # Cedric Nugteren import pandas as pd +import numpy as np def get_entries_by_field(database, field, value): @@ -18,11 +19,6 @@ def concatenate_database(database1, database2): return pd.concat([database1, database2]) -def remove_duplicates(database): - """Removes duplicates from a database""" - return database.drop_duplicates() - - def find_and_replace(database, dictionary): """Finds and replaces entries in a database based on a dictionary. Example: dictionary = { "key_to_edit": { find1: replace1, find2, replace2 } }""" @@ -48,3 +44,27 @@ def update_database(database, condition, field, value): """Updates the database by writing a specific value to a given field, given certain conditions""" database.loc[condition, field] = value return database + + +def remove_duplicates(database): + """Removes duplicates from the database based on all but the 'time' column""" + + # First remove 100% duplicate entries + database = database.drop_duplicates() + + # Replace NaNs with -1 first (needed for groupby) + database = database.replace(np.nan, -1) + + # In case multiple runs for the exact same configuration where made: take just the best performing one into account + other_column_names = list(database.columns.values) + other_column_names.remove("time") + database_by_time = database.groupby(other_column_names,) + num_removals = len(database) - len(database_by_time) + if num_removals > 0: + print("[database] Removing %d entries: keeping only those with the lowest execution time" % num_removals) + print("[database] Note: this might take a while") + database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()]) + + # Re-replace the NaN values + database = database.replace(-1, np.nan) + return database diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index 3bde33c1..d71e604f 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -81,16 +81,6 @@ def get_common_best(database, group_name, verbose): # Removes columns without any values database = database.dropna(axis=1, how='all') - database = database.reset_index() - - # In case multiple runs for the exact same configuration where made: take just the best performing one into account - other_column_names = list(database.columns.values) - other_column_names.remove("time") - database_by_time = database.groupby(other_column_names) - if len(database_by_time) != len(database): - if verbose: - print("[database] " + str(group_name) + " keeping only entries with the lowest execution time") - database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()]) # Inserts the relative execution times into the database def relative_performance(x): diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp index dc2011fd..a6b7dfe8 100644 --- a/src/database/kernels/copy.hpp +++ b/src/database/kernels/copy.hpp @@ -87,7 +87,7 @@ const Database::DatabaseEntry Database::CopySingle = { { "GeForce GTX 750 Ti", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, { "GeForce GTX 980", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } }, { "GeForce GTX TITAN", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } }, - { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, 
{"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } }, + { "GeForce GTX TITAN X", { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } }, { "Tesla K20m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } }, { "Tesla K40m", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } }, { "default", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } }, diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 60fa7555..6e84ca5a 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -79,10 +79,10 @@ const Database::DatabaseEntry Database::XaxpySingle = { { // NVIDIA GPUs kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 1070", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 480", { {"VW",4}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",4} } }, + { "GeForce GTX 480", { {"VW",2}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, - { "GeForce GTX 680", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 680", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750 Ti", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, { "GeForce GTX 980", { {"VW",1}, {"WGS",1024}, {"WPT",1} } }, @@ -209,7 +209,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = { kDeviceTypeGPU, "NVIDIA", { { "GRID K520", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 1070", { {"VW",1}, {"WGS",64}, {"WPT",8} } }, - { "GeForce GTX 480", { {"VW",2}, {"WGS",64}, {"WPT",1} } }, + { "GeForce GTX 480", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, { "GeForce GTX 670", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 680", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "GeForce GTX 750", { {"VW",1}, {"WGS",128}, {"WPT",1} } }, diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 7e8e64e3..03e84525 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::XgemvSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",64}, {"WPT1",4} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index f5e3e630..c12fcdca 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -60,7 +60,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, { "Iris Pro", { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + { "default", { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } }, } }, { // Intel accelerators @@ -88,7 +88,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, } @@ -123,7 +123,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, { "Iris Pro", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } }, - { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + { "default", { {"VW2",1}, 
{"WGS2",64}, {"WPT2",1} } }, } }, { // Intel accelerators @@ -145,7 +145,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW2",1}, {"WGS2",32}, {"WPT2",1} } }, + { "default", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } }, } }, } -- cgit v1.2.3 From e21f32bc9928f87a8d0ff15797e2ed2ab65ded58 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 10 Sep 2016 14:00:43 +0200 Subject: Updated database based on exhaustive tuning results for GEMM for the R9 M370X GPU --- scripts/database/database.py | 1 + src/database/kernels/xgemm.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/database/database.py b/scripts/database/database.py index 944c1bd6..5c859487 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -77,6 +77,7 @@ def main(argv): # Adds the new data to the database old_size = len(database.index) database = db.concatenate_database(database, imported_data) + database = database.drop_duplicates() new_size = len(database.index) print("with " + str(new_size - old_size) + " new items") # Newline printed here diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index c960592d..cc81cf6a 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -158,7 +158,7 @@ const Database::DatabaseEntry Database::XgemmDouble = { "Xgemm", Precision::kDouble, { { // AMD GPUs kDeviceTypeGPU, "AMD", { - { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } }, + { "AMD Radeon R9 M370X Compute Engine", { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } }, { "Hawaii", { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } }, { "Oland", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } }, { "Pitcairn", { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } }, -- cgit v1.2.3 From b5a67f86ecca72b47fc3d0a8231f902752b13c3d Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 11 Sep 2016 21:29:28 +0200 Subject: Complete re-write of the database script. 
Replaced Pandas with the much faster and more convenient plain JSON/dict data type --- .gitignore | 3 +- scripts/database/database.py | 46 +++---- scripts/database/database/bests.py | 67 +++++++---- scripts/database/database/clblast.py | 42 ++++--- scripts/database/database/db.py | 90 +++++++------- scripts/database/database/defaults.py | 220 +++++++++++++++++++++++----------- scripts/database/database/io.py | 46 +++---- src/database/kernels/xaxpy.hpp | 4 +- src/database/kernels/xdot.hpp | 8 +- src/database/kernels/xgemm.hpp | 2 +- src/database/kernels/xgemv.hpp | 4 +- src/database/kernels/xgemv_fast.hpp | 4 +- 12 files changed, 309 insertions(+), 227 deletions(-) diff --git a/.gitignore b/.gitignore index bcb32754..8ccab476 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,6 @@ build stash .* *.pyc -*.db +database.json +database_best.json cl.hpp \ No newline at end of file diff --git a/scripts/database/database.py b/scripts/database/database.py index 5c859487..f758a2b7 100755 --- a/scripts/database/database.py +++ b/scripts/database/database.py @@ -11,8 +11,6 @@ import os.path import glob import argparse -import pandas as pd - import database.io as io import database.db as db import database.clblast as clblast @@ -20,15 +18,15 @@ import database.bests as bests import database.defaults as defaults # Server storing a copy of the database -DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.db" +DATABASE_SERVER_URL = "http://www.cedricnugteren.nl/tuning/clblast.json" # OpenCL vendor names and their short name -VENDOR_TRANSLATION_TABLE = {"device_vendor": { +VENDOR_TRANSLATION_TABLE = { "GenuineIntel": "Intel", "Intel(R) Corporation": "Intel", "Advanced Micro Devices, Inc.": "AMD", "NVIDIA Corporation": "NVIDIA", -}} +} def main(argv): @@ -41,7 +39,8 @@ def main(argv): cl_args = parser.parse_args(argv) # Parses the path arguments - database_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database.db") + database_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database.json") + database_best_filename = os.path.join(cl_args.clblast_root, "scripts", "database", "database_best.json") json_files = os.path.join(cl_args.source_folder, "*.json") cpp_database_path = os.path.join(cl_args.clblast_root, "src", "database", "kernels") @@ -52,11 +51,6 @@ def main(argv): if len(glob.glob(json_files)) < 1: print("[database] The path '" + cl_args.source_folder + "' does not contain any JSON files") - # Pandas options - pd.set_option('display.width', 1000) - if cl_args.verbose: - print("[database] Using pandas version " + pd.__version__) - # Downloads the database if a local copy is not present if not os.path.isfile(database_filename): io.download_database(database_filename, DATABASE_SERVER_URL) @@ -68,30 +62,22 @@ def main(argv): for file_json in glob.glob(json_files): # Loads the newly imported data - sys.stdout.write("[database] Processing '"+file_json+"' ") # No newline printed - imported_data = io.load_json_to_pandas(file_json) + sys.stdout.write("[database] Processing '" + file_json + "' ") # No newline printed + imported_data = io.load_tuning_results(file_json) # Fixes the problem that some vendors use multiple different names - imported_data = db.find_and_replace(imported_data, VENDOR_TRANSLATION_TABLE) + for target in VENDOR_TRANSLATION_TABLE: + if imported_data["device_vendor"] == target: + imported_data["device_vendor"] = VENDOR_TRANSLATION_TABLE[target] # Adds the new data to the database - old_size = len(database.index) - database = 
db.concatenate_database(database, imported_data) - database = database.drop_duplicates() - new_size = len(database.index) + old_size = db.length(database) + database = db.add_section(database, imported_data) + new_size = db.length(database) print("with " + str(new_size - old_size) + " new items") # Newline printed here # Stores the modified database back to disk if len(glob.glob(json_files)) >= 1: - database = db.remove_duplicates(database) - io.save_database(database, database_filename) - - # Optional: update the database here. Default is disabled, code below is just an example - if False: # TODO: Use command-line arguments to enable updates in a flexible way - database = db.update_database(database, - ((database["kernel"] == "CopyMatrixFast") & - (database["precision"] == "3232")), - "arg_alpha", "2+0.5i") io.save_database(database, database_filename) # Retrieves the best performing results @@ -101,7 +87,11 @@ def main(argv): # Determines the defaults for other vendors and per vendor print("[database] Calculating the default values...") database_defaults = defaults.calculate_defaults(database, cl_args.verbose) - database_best_results = db.concatenate_database(database_best_results, database_defaults) + database_best_results["sections"].extend(database_defaults["sections"]) + + # Optionally outputs the database to disk + if cl_args.verbose: + io.save_database(database_best_results, database_best_filename) # Outputs the database as a C++ database print("[database] Producing a C++ database in '" + cpp_database_path + "'...") diff --git a/scripts/database/database/bests.py b/scripts/database/database/bests.py index e6239258..c924efde 100644 --- a/scripts/database/database/bests.py +++ b/scripts/database/database/bests.py @@ -5,39 +5,54 @@ # Author(s): # Cedric Nugteren -import pandas as pd -import clblast +import sys -def get_best_results(df): - """Retrieves the results with the lowests execution times""" - database_bests = pd.DataFrame() - database_entries = df.groupby(clblast.ATTRIBUTES + ["kernel"]) - for name, database_entry in database_entries: - best_time = database_entry["time"].min() - best_parameters = database_entry[database_entry["time"] == best_time].iloc[0] - database_bests = database_bests.append(best_parameters, ignore_index=True) - return database_bests +def get_best_results(database): + """Retrieves the results with the lowest execution times""" + sections_best = [] + for section in database["sections"]: + section_best = {} + # Stores all the section's meta data + for attribute in section.keys(): + if attribute != "results": + section_best[attribute] = section[attribute] -def get_relative_bests(df, parameter_column_names, name, verbose=False): - """Retrieves the relative best execution time over different devices""" + # Find the best result + parameters_best = None + time_best = sys.float_info.max + for result in section["results"]: + if result["time"] < time_best: + time_best = result["time"] + parameters_best = result["parameters"] + + # Stores the best result + section_best["results"] = [{"time": time_best, "parameters": parameters_best}] + sections_best.append(section_best) + + return {"sections": sections_best} - # Computes the sum of the execution times over the different devices - def sum_performance(x): - x["group_performance"] = x["relative_performance"].sum() - return x - df = df.groupby(parameter_column_names).apply(sum_performance) - # Retrieves the entries with the highest performance - best_performance = df["group_performance"].max() - df_bests = 
df[df["group_performance"] == best_performance] +def get_relative_bests(name, common_results, common_parameters, verbose=False): + """Retrieves the parameters with the relative best execution time over different devices""" + + # Helper function + def argmax(iterable): + return max(enumerate(iterable), key=lambda x: x[1])[0] + + # Computes the sum of the execution times over the different devices + performance_sums = [] + for parameters in common_parameters: + performance_sum = sum([r["relative_performance"] for r in common_results if r["parameters"] == parameters]) + performance_sums.append(performance_sum) - # Retrieves one example only (the parameters are the same anyway) - df_bests = df_bests.drop_duplicates(["group_performance"]) + # Retrieves the entry with the highest performance + best_index = argmax(performance_sums) + best_performance = performance_sums[best_index] + best_parameters = common_parameters[best_index] # Completed, report and return the results if verbose: - print("[database] " + str(name) + " with performance " + str(best_performance) + " " + str(df_bests.shape)) - assert len(df_bests) == 1 - return df_bests + print("[database] " + str(name) + " with performance " + str(best_performance)) + return best_parameters diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py index 46b711cc..beed46d9 100644 --- a/scripts/database/database/clblast.py +++ b/scripts/database/database/clblast.py @@ -18,6 +18,7 @@ DEVICE_ATTRIBUTES = ["device", "device_core_clock", "device_compute_units"] KERNEL_ATTRIBUTES = ["precision", "kernel_family"] ARGUMENT_ATTRIBUTES = ["arg_m", "arg_n", "arg_k", "arg_alpha", "arg_beta"] ATTRIBUTES = DEVICE_ATTRIBUTES + DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ARGUMENT_ATTRIBUTES +GROUP_ATTRIBUTES = DEVICE_TYPE_ATTRIBUTES + KERNEL_ATTRIBUTES + ["kernel"] + ARGUMENT_ATTRIBUTES def precision_to_string(precision): @@ -81,42 +82,51 @@ def print_cpp_database(database, output_dir): """Outputs the database as C++ code""" # Iterates over the kernel families - for family_name, family_database in database.groupby(["kernel_family"]): - family_database = family_database.dropna(axis=1, how='all') + kernel_families = [s["kernel_family"] for s in database["sections"]] + for family_name in kernel_families: + family_database = [s for s in database["sections"] if s["kernel_family"] == family_name] # Opens a new file for each kernel family - full_path = os.path.join(output_dir, family_name+'.hpp') + full_path = os.path.join(output_dir, family_name + ".hpp") with open(full_path, 'w+') as f: f.write(get_cpp_header(family_name)) # Loops over the different precision (e.g. 16, 32, 3232, 64, 6464) - for precision, precision_database in family_database.groupby(["precision"]): + precisions = sorted(set([s["precision"] for s in family_database])) + for precision in precisions: + precision_database = [s for s in family_database if s["precision"] == precision] f.write(get_cpp_precision(family_name, precision)) # Loops over a combination of device vendors and device types (e.g. 
AMD GPU) - for vendor, vendor_database in precision_database.groupby(["device_vendor"]): - for device_type, device_type_database in vendor_database.groupby(["device_type"]): + device_vendors = sorted(set([s["device_vendor"] for s in precision_database])) + for vendor in device_vendors: + vendor_database = [s for s in precision_database if s["device_vendor"] == vendor] + device_types = sorted(set([s["device_type"] for s in vendor_database])) + for device_type in device_types: + type_database = [s for s in vendor_database if s["device_type"] == device_type] f.write(get_cpp_device_vendor(vendor, device_type)) # Loops over every device of this vendor-type combination - for device_name, device_database in device_type_database.groupby(["device"]): + devices = sorted(set([s["device"] for s in type_database])) + for device_name in devices: + device_database = [s for s in type_database if s["device"] == device_name] device_name_quoted = "\"%s\"," % device_name device_name_cpp = " { %-50s { " % device_name_quoted f.write(device_name_cpp) # Collects the parameters for this entry parameters = [] - for kernel, kernel_database in device_database.groupby(["kernel"]): - kernel_database = kernel_database.dropna(axis=1) + kernels = sorted(set([s["kernel"] for s in device_database])) + for kernel in kernels: + kernel_database = [s for s in device_database if s["kernel"] == kernel] - # Only consider the actual parameters, not the precision - def is_parameter(column): - return column.startswith('parameters.') and column != "parameters.PRECISION" - column_names = [col for col in list(kernel_database) if is_parameter(col)] + assert len(kernel_database) == 1 + results = kernel_database[0]["results"] - for p in column_names: - parameter_name = p.replace("parameters.", "") - parameter_value = int(kernel_database[p].iloc[0]) + assert len(results) == 1 + new_parameters = results[0]["parameters"] + for parameter_name in sorted(new_parameters): + parameter_value = new_parameters[parameter_name] parameters.append("{\"" + parameter_name + "\"," + str(parameter_value) + "}") # Prints the entry diff --git a/scripts/database/database/db.py b/scripts/database/database/db.py index 6534d689..94948b1a 100644 --- a/scripts/database/database/db.py +++ b/scripts/database/database/db.py @@ -5,66 +5,60 @@ # Author(s): # Cedric Nugteren -import pandas as pd -import numpy as np +import clblast -def get_entries_by_field(database, field, value): - """Retrieves entries from the database with a specific value for a given field""" - return database[database[field] == value] +def length(database): + """Computes the total number of tuning entries""" + num_tuning_entries = 0 + for section in database["sections"]: + num_tuning_entries += len(section["results"]) + return num_tuning_entries -def concatenate_database(database1, database2): - """Concatenates two databases row-wise and returns the result""" - return pd.concat([database1, database2]) +def add_section(database, new_section): + """Adds a new section to the database""" + for old_section in database["sections"]: + # Verify whether the sections match + equal = True + for attribute in new_section.keys(): + if attribute != "results": + if attribute not in old_section or new_section[attribute] != old_section[attribute]: + equal = False + break -def find_and_replace(database, dictionary): - """Finds and replaces entries in a database based on a dictionary. 
Example: - dictionary = { "key_to_edit": { find1: replace1, find2, replace2 } }""" - return database.replace(dictionary) + # They match: append the new section's results to the corresponding entry in the database and return + if equal: + old_section["results"] = combine_results(old_section["results"], new_section["results"]) + return database - -def remove_entries_by_key_value(database, key, value): - """Removes entries in the databased which have a specific value for a given key""" - return database[database[key] != value] - - -def remove_entries_by_device(database, device_name): - """Shorthand for the above, specifically removes entries for a given device""" - return remove_entries_by_key_value(database, "device", device_name) - - -def remove_entries_by_kernel_family(database, kernel_family_name): - """Shorthand for the above, specifically removes entries for a given kernel family""" - return remove_entries_by_key_value(database, "kernel_family", kernel_family_name) + # No match found: append the whole new section to the database + database["sections"].append(new_section) + return database -def update_database(database, condition, field, value): - """Updates the database by writing a specific value to a given field, given certain conditions""" - database.loc[condition, field] = value - return database +def combine_results(old_results, new_results): + """Adds new results to the results JSON list""" + for new_result in new_results: + old_results = combine_result(old_results, new_result) + return old_results -def remove_duplicates(database): - """Removes duplicates from the database based on all but the 'time' column""" +def combine_result(old_results, new_result): + """Adds a new result to the results JSON list; filters for duplicate entries and saves the best performing one""" - # First remove 100% duplicate entries - database = database.drop_duplicates() + # Loops over all existing results to test for already existing entries with these parameters + for old_result in old_results: - # Replace NaNs with -1 first (needed for groupby) - database = database.replace(np.nan, -1) + # Verify whether the results match + equal = new_result["parameters"] == old_result["parameters"] - # In case multiple runs for the exact same configuration where made: take just the best performing one into account - other_column_names = list(database.columns.values) - other_column_names.remove("time") - database_by_time = database.groupby(other_column_names,) - num_removals = len(database) - len(database_by_time) - if num_removals > 0: - print("[database] Removing %d entries: keeping only those with the lowest execution time" % num_removals) - print("[database] Note: this might take a while") - database = database_by_time.apply(lambda x: x[x["time"] == x["time"].min()]) + # They match: keep only the one with the minimum execution time + if equal: + old_result["time"] = min(old_result["time"], new_result["time"]) + return old_results - # Re-replace the NaN values - database = database.replace(-1, np.nan) - return database + # No match found: append a new result + old_results.append(new_result) + return old_results diff --git a/scripts/database/database/defaults.py b/scripts/database/database/defaults.py index d71e604f..00405908 100644 --- a/scripts/database/database/defaults.py +++ b/scripts/database/database/defaults.py @@ -5,102 +5,176 @@ # Author(s): # Cedric Nugteren -import pandas as pd import clblast import bests -def set_default_device(database_entry): +def set_default_device(section): """Sets the device name 
and parameters to some default values""" - database_entry["device"] = clblast.DEVICE_NAME_DEFAULT - database_entry["device_compute_units"] = 0 - database_entry["device_core_clock"] = 0 - return database_entry - - -def set_default_time(database_entry): - """Sets the execution time to some default value""" - database_entry["time"] = 0.0 - return database_entry - - -def calculate_defaults(database, verbose, calculate_common_best=True): - """Sets defaults for devices of the same type/vendor. An option determines how to compute the defaults.""" - database_defaults = pd.DataFrame() - - # Defaults per combination of device vendors and device types (e.g. AMD GPU) - database_type_vendor = database.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + - clblast.ARGUMENT_ATTRIBUTES) - for group_name, database_group in database_type_vendor: - if calculate_common_best: - default_values = get_common_best(database_group, group_name, verbose) - else: - default_values = get_smallest_best(database_group) - default_values = set_default_device(default_values) - default_values = set_default_time(default_values) - database_defaults = database_defaults.append(default_values, ignore_index=True) - - # Checks for mis-matched arguments - groups = database_defaults.groupby(clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"]) - for group_name, database_group in groups: - if len(database_group) != 1: - print("[WARNING] Entries for a single kernel with multiple argument values: " + str(group_name)) - - # Defaults over all device types and vendors - groups = database.groupby(clblast.KERNEL_ATTRIBUTES + ["kernel"] + clblast.ARGUMENT_ATTRIBUTES) - for group_name, database_group in groups: - if calculate_common_best: - default_values = get_common_best(database_group, group_name, verbose) - else: - default_values = get_smallest_best(database_group) - default_values["device_vendor"] = clblast.VENDOR_DEFAULT - default_values["device_type"] = clblast.DEVICE_TYPE_DEFAULT - default_values = set_default_device(default_values) - default_values = set_default_time(default_values) - database_defaults = database_defaults.append(default_values, ignore_index=True) + section["device"] = clblast.DEVICE_NAME_DEFAULT + section["device_compute_units"] = 0 + section["device_core_clock"] = 0 + return section + + +def set_identifiers(database, group_by_attributes, identifier_name): + """Sets a group-identifier based on a given set of attributes. 
Modifies the database but also returns a list of + unique identifiers.""" + identifiers = [] + for section in database["sections"]: + identifier = [] + for attribute in group_by_attributes: + if attribute in section: + identifier.append(section[attribute]) + section[identifier_name] = ";".join(identifier) + identifiers.append(section[identifier_name]) + return sorted(set(identifiers)) + + +def remove_identifiers(database, identifier_name): + """Removes an identifier from all sections in the database""" + for section in database["sections"]: + section.pop(identifier_name, None) + + +def get_groups_by_identifier(database, group_identifiers, identifier_name): + """Returns a list of (group, group_identifier) tuples based on a previously made grouping""" + groups = [] + for group_identifier in group_identifiers: + + # Get all sections in this group + group = [] + for section in database["sections"]: + if section[identifier_name] == group_identifier: + group.append(section) + + groups.append((group, group_identifier)) + return groups + + +def calculate_defaults(database, verbose): + """Sets defaults for devices of the same type/vendor""" + + # Groups the database by kernel, vendor and device type (e.g. AMD GPU) + group_identifiers = set_identifiers(database, clblast.GROUP_ATTRIBUTES, "group_identifier") + groups = get_groups_by_identifier(database, group_identifiers, "group_identifier") + + # Loops over all groups + default_sections = {"sections": []} + for group, group_identifier in groups: + + # Computes the best parameters + default_parameters = get_common_best_parameters(group, group_identifier, verbose) + + # Stores all the section's data + assert len(group) > 0 + default_section = {} + for attribute in group[0].keys(): + if attribute != "results" and attribute != "group_identifier": + default_section[attribute] = group[0][attribute] + default_section = set_default_device(default_section) + default_section["results"] = [{"time": 0.0, "parameters": default_parameters}] + default_sections["sections"].append(default_section) + + # Groups the database by kernel, vendor and device type (e.g. AMD GPU) - but not by arguments! This is to check for + # mis-matched arguments. 
+ attributes = clblast.DEVICE_TYPE_ATTRIBUTES + clblast.KERNEL_ATTRIBUTES + ["kernel"] + group_identifiers = set_identifiers(default_sections, attributes, "temp_identifier") + groups = get_groups_by_identifier(default_sections, group_identifiers, "temp_identifier") + for group, group_identifier in groups: + if len(group) != 1: + print("[ERROR] Entries for a single kernel with multiple argument values: " + str(group_identifier)) + assert len(group) == 1 + remove_identifiers(default_sections, "temp_identifier") + + # Groups the database by kernel only + group_identifiers = set_identifiers(database, clblast.KERNEL_ATTRIBUTES + ["kernel"], "group_identifier") + groups = get_groups_by_identifier(database, group_identifiers, "group_identifier") + + # Loops over all groups + for group, group_identifier in groups: + + # Computes the best parameters + default_parameters = get_common_best_parameters(group, group_identifier, verbose) + + # Stores all the section's data + assert len(group) > 0 + default_section = {} + for attribute in group[0].keys(): + if attribute != "results" and attribute != "group_identifier": + default_section[attribute] = group[0][attribute] + default_section = set_default_device(default_section) + default_section["device_vendor"] = clblast.VENDOR_DEFAULT + default_section["device_type"] = clblast.DEVICE_TYPE_DEFAULT + default_section["results"] = [{"time": 0.0, "parameters": default_parameters}] + default_sections["sections"].append(default_section) # Database with both types of defaults only - return database_defaults + return default_sections -def get_smallest_best(database): +def get_smallest_best_parameters(group): """Sets defaults based on the smallest values of all known entries. The average might be better for performance but some parameters might not be supported on other devices.""" - database_best_results = bests.get_best_results(database) - return database_best_results.min(axis=0) + # Counts the number of devices in this group + assert len(group) > 0 + + # Find the smallest values of the parameters + min_parameters = {} + for section in group: + assert len(section["results"]) > 0 + minimum_time = min([result["time"] for result in section["results"]]) + for result in section["results"]: + if result["time"] == minimum_time: + for parameter in result["parameters"]: + if parameter in min_parameters: + min_parameters[parameter] = min(min_parameters[parameter], result["parameters"][parameter]) + else: + min_parameters[parameter] = result["parameters"][parameter] + + return min_parameters -def get_common_best(database, group_name, verbose): + +def get_common_best_parameters(group, group_identifier, verbose): """Sets defaults based on the best values of entries supported by all devices. This might cause a problem in case not every device was tuned with the same parameters. 
In that case it falls back to the above method to retrieve the smallest best execution time""" # Counts the number of devices in this group - num_devices = len(database.groupby(clblast.DEVICE_ATTRIBUTES)) - - # Removes columns without any values - database = database.dropna(axis=1, how='all') + num_devices = len(group) + assert num_devices > 0 # Inserts the relative execution times into the database - def relative_performance(x): - x["relative_performance"] = x["time"].min() / x["time"] - return x - database = database.groupby(clblast.ATTRIBUTES + ["kernel"]).apply(relative_performance) - - # Retrieves the parameter names for this kernel - all_column_names = list(database.columns.values) - parameter_column_names = [c for c in all_column_names if "parameters." in c] - - # Removes entries which are not available for all devices - database_by_parameters = database.groupby(parameter_column_names) - database_common = database_by_parameters.filter(lambda x: len(x) == num_devices) + for section in group: + assert len(section["results"]) > 0 + minimum_time = min([result["time"] for result in section["results"]]) + for result in section["results"]: + result["relative_performance"] = minimum_time / result["time"] + + # Determine which parameters are available for all devices + common_parameters = [result["parameters"] for result in group[0]["results"]] # Parameters of the first section + for i in range(1, num_devices): + section_parameters = [result["parameters"] for result in group[i]["results"]] + common_parameters = [p for p in section_parameters if p in common_parameters] # Intersection of the parameters # Fall back to another method in case there are no shared entries at all across devices - if len(database_common) == 0: + if len(common_parameters) == 0: if verbose: - print("[database] No common kernels for: " + str(group_name) + " with devices: %d " % num_devices) - return get_smallest_best(database) + print("[database] No common kernels for: " + str(group_identifier) + " with devices: %d " % num_devices) + smallest_best_parameters = get_smallest_best_parameters(group) + if verbose: + print("[database] " + str(group_identifier)) + return smallest_best_parameters + + # Removes entries with parameters which are not common + common_results = [] + for section in group: + for result in section["results"]: + if result["parameters"] in common_parameters: + common_results.append(result) # Retrieves the entries with the highest relative performance - return bests.get_relative_bests(database_common, parameter_column_names, group_name, verbose) + relative_best_parameters = bests.get_relative_bests(group_identifier, common_results, common_parameters, verbose) + return relative_best_parameters diff --git a/scripts/database/database/io.py b/scripts/database/database/io.py index ad2f7ae9..d14f1297 100644 --- a/scripts/database/database/io.py +++ b/scripts/database/database/io.py @@ -13,46 +13,48 @@ try: except ImportError: from urllib2 import urlopen # Python 2 -import pandas as pd - -import clblast - def download_database(filename, database_url): """Downloads a database and saves it to disk""" print("[database] Downloading database from '" + database_url + "'...") database = urlopen(database_url) - with open(filename, 'wb') as f: + with open(filename, "wb") as f: f.write(database.read()) def load_database(filename): """Loads a database from disk""" print("[database] Loading database from '" + filename + "'") - return pd.read_pickle(filename) + with open(filename) as f: + return json.load(f) def 
save_database(database, filename): """Saves a database to disk""" print("[database] Saving database to '" + filename + "'") - database.to_pickle(filename) + with open(filename, "wb") as f: + json.dump(database, f, sort_keys=True, indent=4) -def load_json_to_pandas(filename): - """Loads JSON data from file and converts it to a pandas database""" +def load_tuning_results(filename): + """Loads JSON data from file and pre-processes it""" with open(filename) as f: json_data = json.load(f) - # Gathers all results and stores them in a new database - json_database = pd.DataFrame(json_data) - new_database = pd.io.json.json_normalize(json_database["results"]) - - # Sets the common attributes to each entry in the results - for attribute in clblast.ATTRIBUTES: - if attribute == "kernel_family": - new_database[attribute] = re.sub(r'_\d+', '', json_data[attribute]) - elif attribute in json_data: - new_database[attribute] = json_data[attribute] - else: - new_database[attribute] = 0 # For example a parameters that was not used by this kernel - return new_database + # Removes the numbering following the kernel family name + json_data["kernel_family"] = re.sub(r'_\d+', '', json_data["kernel_family"]) + + # Adds the kernel name to the section instead of to the individual results + assert len(json_data["results"]) > 0 + json_data["kernel"] = json_data["results"][0]["kernel"] + for result in json_data["results"]: + assert json_data["kernel"] == result["kernel"] + result.pop("kernel", None) + + # Removes the 'PRECISION' parameter from the individual results: it is redundant + for result in json_data["results"]: + assert json_data["precision"] == str(result["parameters"]["PRECISION"]) + result["parameters"].pop("PRECISION", None) + + # All done + return json_data diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp index 6e84ca5a..9c1bcd99 100644 --- a/src/database/kernels/xaxpy.hpp +++ b/src/database/kernels/xaxpy.hpp @@ -20,12 +20,12 @@ const Database::DatabaseEntry Database::XaxpyHalf = { kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, - { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW",4}, {"WGS",512}, {"WPT",8} } }, + { "default", { {"VW",8}, {"WGS",64}, {"WPT",1} } }, } }, } diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp index f862d00e..987a990d 100644 --- a/src/database/kernels/xdot.hpp +++ b/src/database/kernels/xdot.hpp @@ -38,11 +38,10 @@ const Database::DatabaseEntry Database::XdotSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",128}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",128}, {"WGS2",32} } }, - { "default", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -90,11 +89,10 @@ const Database::DatabaseEntry Database::XdotComplexSingle = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",128}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",64}, {"WGS2",32} } }, - { 
"default", { {"WGS1",256}, {"WGS2",32} } }, + { "default", { {"WGS1",128}, {"WGS2",32} } }, } }, { // Intel CPUs @@ -142,7 +140,6 @@ const Database::DatabaseEntry Database::XdotDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",128} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",128}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, @@ -184,7 +181,6 @@ const Database::DatabaseEntry Database::XdotComplexDouble = { { // AMD GPUs kDeviceTypeGPU, "AMD", { { "AMD Radeon R9 M370X Compute Engine", { {"WGS1",64}, {"WGS2",32} } }, - { "Hawaii", { {"WGS1",256}, {"WGS2",32} } }, { "Oland", { {"WGS1",256}, {"WGS2",32} } }, { "Pitcairn", { {"WGS1",256}, {"WGS2",32} } }, { "Tahiti", { {"WGS1",256}, {"WGS2",32} } }, diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index cc81cf6a..7e793076 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -76,7 +76,7 @@ const Database::DatabaseEntry Database::XgemmSingle = { }, { // Default kDeviceTypeAll, "default", { - { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, } }, } diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp index 03e84525..e5e8845e 100644 --- a/src/database/kernels/xgemv.hpp +++ b/src/database/kernels/xgemv.hpp @@ -20,12 +20,12 @@ const Database::DatabaseEntry Database::XgemvHalf = { kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"WGS1",128}, {"WPT1",1} } }, - { "default", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"WGS1",128}, {"WPT1",1} } }, + { "default", { {"WGS1",64}, {"WPT1",1} } }, } }, } diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp index c12fcdca..52af628c 100644 --- a/src/database/kernels/xgemv_fast.hpp +++ b/src/database/kernels/xgemv_fast.hpp @@ -20,12 +20,12 @@ const Database::DatabaseEntry Database::XgemvFastHalf = { kDeviceTypeGPU, "Intel", { { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, { "Intel(R) HD Graphics Skylake ULT GT2", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, - { "default", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, } }, { // Default kDeviceTypeAll, "default", { - { "default", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } }, + { "default", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } }, } }, } -- cgit v1.2.3 From aa3dffe356cc3c85e4d49508a4f21f4becba6e8c Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 12 Sep 2016 20:13:38 +0200 Subject: Added XgemvFastRot and Xgemm 16-bit tuning results: just defaults which are now automatically taken from 32-bit if there are no entries at all --- scripts/database/database/clblast.py | 18 +++++++++++++++--- src/database/database.cpp | 4 ++-- src/database/database.hpp | 4 ++-- src/database/kernels/xgemm.hpp | 12 ++++++++++++ src/database/kernels/xgemv_fast_rot.hpp | 12 ++++++++++++ 5 files 
changed, 43 insertions(+), 7 deletions(-) diff --git a/scripts/database/database/clblast.py b/scripts/database/database/clblast.py index beed46d9..8190f225 100644 --- a/scripts/database/database/clblast.py +++ b/scripts/database/database/clblast.py @@ -82,7 +82,7 @@ def print_cpp_database(database, output_dir): """Outputs the database as C++ code""" # Iterates over the kernel families - kernel_families = [s["kernel_family"] for s in database["sections"]] + kernel_families = sorted(set([s["kernel_family"] for s in database["sections"]])) for family_name in kernel_families: family_database = [s for s in database["sections"] if s["kernel_family"] == family_name] @@ -92,15 +92,27 @@ def print_cpp_database(database, output_dir): f.write(get_cpp_header(family_name)) # Loops over the different precision (e.g. 16, 32, 3232, 64, 6464) - precisions = sorted(set([s["precision"] for s in family_database])) + precisions = sorted(set([s["precision"] for s in database["sections"]])) # Based on full database for precision in precisions: precision_database = [s for s in family_database if s["precision"] == precision] f.write(get_cpp_precision(family_name, precision)) - # Loops over a combination of device vendors and device types (e.g. AMD GPU) + # In case there is nothing found at all (e.g. 16-bit): continue as if this was a precision of 32 but + # with the defaults only + if len(precision_database) == 0: + print("[database] No results found for %s:%s, retrieving defaults from %s:32" % + (family_name, precision, family_name)) + precision_database = [s for s in family_database if s["precision"] == "32" + and s["device_vendor"] == VENDOR_DEFAULT + and s["device_type"] == DEVICE_TYPE_DEFAULT + and s["device"] == DEVICE_NAME_DEFAULT] + + # Loops over device vendors (e.g. AMD) device_vendors = sorted(set([s["device_vendor"] for s in precision_database])) for vendor in device_vendors: vendor_database = [s for s in precision_database if s["device_vendor"] == vendor] + + # Loops over device types (e.g. 
GPU) device_types = sorted(set([s["device_type"] for s in vendor_database])) for device_type in device_types: type_database = [s for s in vendor_database if s["device_type"] == device_type] diff --git a/src/database/database.cpp b/src/database/database.cpp index 38974b95..34c44a29 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -35,9 +35,9 @@ const std::vector Database::database = { XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble, XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble, XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble, - /* XgemvFastRotHalf, */ XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, + XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble, XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble, - /* XgemmHalf, */ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, + XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble, CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble, PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble, TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble, diff --git a/src/database/database.hpp b/src/database/database.hpp index 8d6d3863..a6ab49c5 100644 --- a/src/database/database.hpp +++ b/src/database/database.hpp @@ -72,9 +72,9 @@ class Database { static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble; static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble; static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble; - static const DatabaseEntry /* XgemvFastRotHalf, */ XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; + static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble; static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble; - static const DatabaseEntry /* XgemmHalf, */ XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; + static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble; static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble; static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble; static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble; diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp index 7e793076..d19c55b5 100644 --- a/src/database/kernels/xgemm.hpp +++ b/src/database/kernels/xgemm.hpp @@ -14,6 +14,18 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::XgemmHalf = { + "Xgemm", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } }, + } + }, + } +}; + +// 
================================================================================================= + const Database::DatabaseEntry Database::XgemmSingle = { "Xgemm", Precision::kSingle, { { // AMD GPUs diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp index 3d2e0d3a..328094e1 100644 --- a/src/database/kernels/xgemv_fast_rot.hpp +++ b/src/database/kernels/xgemv_fast_rot.hpp @@ -14,6 +14,18 @@ namespace clblast { // ================================================================================================= +const Database::DatabaseEntry Database::XgemvFastRotHalf = { + "XgemvFastRot", Precision::kHalf, { + { // Default + kDeviceTypeAll, "default", { + { "default", { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } }, + } + }, + } +}; + +// ================================================================================================= + const Database::DatabaseEntry Database::XgemvFastRotSingle = { "XgemvFastRot", Precision::kSingle, { { // AMD GPUs -- cgit v1.2.3 From 4ce584a01404055fdb23f78b4ac359394b559ea1 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Mon, 12 Sep 2016 22:13:16 +0200 Subject: Split the XGEMM kernel further up: now in 3 parts. This is done because MSVC can't handle long strings --- src/kernels/level3/xgemm_part1.opencl | 2 +- src/kernels/level3/xgemm_part2.opencl | 208 +----------------------------- src/kernels/level3/xgemm_part3.opencl | 229 ++++++++++++++++++++++++++++++++++ src/routines/level3/xgemm.cpp | 1 + src/routines/level3/xher2k.cpp | 1 + src/routines/level3/xherk.cpp | 1 + src/routines/level3/xsyr2k.cpp | 1 + src/routines/level3/xsyrk.cpp | 1 + src/tuning/kernels/xgemm.cpp | 1 + 9 files changed, 237 insertions(+), 208 deletions(-) create mode 100644 src/kernels/level3/xgemm_part3.opencl diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl index 1ad0a558..d0ce06ad 100644 --- a/src/kernels/level3/xgemm_part1.opencl +++ b/src/kernels/level3/xgemm_part1.opencl @@ -31,7 +31,7 @@ // o-------o o-----o // // -// This kernel is seperated into two files. This is part 1 out of 2. +// This kernel is seperated into three files. This is part 1 out of 3. // // ================================================================================================= diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl index faf17e49..e8234a29 100644 --- a/src/kernels/level3/xgemm_part2.opencl +++ b/src/kernels/level3/xgemm_part2.opencl @@ -7,7 +7,7 @@ // Author(s): // Cedric Nugteren // -// This is part 2 of 2 of the GEMM kernel. See part 1 for more information. +// This is part 2 of 3 of the GEMM kernel. See part 1 for more information. // // ================================================================================================= @@ -227,212 +227,6 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int // ================================================================================================= -// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above. 
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, - const __global realM* restrict agm, const __global realN* restrict bgm, - __global realM* cgm, realM cpm[NWI][MWI/VWM] - #if SA == 1 && SB == 1 - , __local realM* alm, __local realN* blm - #elif SA == 1 - , __local realM* alm - #elif SB == 1 - , __local realN* blm - #endif - ) { - - // Allocates workitem-private memory (registers) - realM apm[MWI/VWM]; - realN bpm[NWI/VWN]; - - // Combined thread identifier (volatile to disable caching) - #if SA == 1 || SB == 1 - volatile int tid = get_local_id(0) + MDIMC*get_local_id(1); - #endif - - // Initializes the accumulation registers - InitAccRegisters(cpm); - - // Loops over all workgroup tiles - for (int kwg=0; kwg local (matrix A) - #if SA == 1 - GlobalToLocalA(agm, alm, kSizeM, tid, kwg); - #endif - // Loads data: off-chip --> local (matrix B) - #if SB == 1 - GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); - #endif - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - - // Loops over all workitem tiles, unrolled by a factor KWI - for (int pwi=0; pwi private (matrix A) - #if SA == 1 - LocalToPrivateA(alm, apm, kg); - // Loads data: off-chip --> private (matrix A) - #else - GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); - #endif - - // Loads data: local --> private (matrix B) - #if SB == 1 - LocalToPrivateB(blm, bpm, kg); - // Loads data: off-chip --> private (matrix B) - #else - GlobalToPrivateB(bgm, bpm, kSizeN, idk); - #endif - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulate(cpm, apm, bpm); - } - } - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - } - #if GLOBAL_MEM_FENCE == 1 - barrier(CLK_GLOBAL_MEM_FENCE); - #endif -} - -// ================================================================================================= -// The upper-triangular and lower-triangular kernels are only used in special cases -#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) - -// Main entry point of the kernel. This is the upper-triangular version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -void XgemmUpper(const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Skip these threads if they do not contain threads contributing to the upper-triangle - if (GetGroupID1()*NWG < GetGroupID0()*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// Main entry point of the kernel. This is the lower-triangular version. 
-__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -void XgemmLower(const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Skip these threads if they do not contain threads contributing to the lower-triangle - if (GetGroupID1()*NWG > GetGroupID0()*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// ================================================================================================= -// If not using a triangular version, include the regular kernel -#else - -// Main entry point of the kernel. This is the regular full version. -__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const real_arg arg_alpha, - const real_arg arg_beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - const real alpha = GetRealArg(arg_alpha); - const real beta = GetRealArg(arg_beta); - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeM, alpha, beta); -} - -#endif -// ================================================================================================= - // End of the C++11 raw string literal )" diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl new file mode 100644 index 00000000..a5faef5a --- /dev/null +++ b/src/kernels/level3/xgemm_part3.opencl @@ -0,0 +1,229 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This is part 3 of 3 of the GEMM kernel. See part 1 for more information. 
+// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above. +inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, + const __global realM* restrict agm, const __global realN* restrict bgm, + __global realM* cgm, realM cpm[NWI][MWI/VWM] + #if SA == 1 && SB == 1 + , __local realM* alm, __local realN* blm + #elif SA == 1 + , __local realM* alm + #elif SB == 1 + , __local realN* blm + #endif + ) { + + // Allocates workitem-private memory (registers) + realM apm[MWI/VWM]; + realN bpm[NWI/VWN]; + + // Combined thread identifier (volatile to disable caching) + #if SA == 1 || SB == 1 + volatile int tid = get_local_id(0) + MDIMC*get_local_id(1); + #endif + + // Initializes the accumulation registers + InitAccRegisters(cpm); + + // Loops over all workgroup tiles + for (int kwg=0; kwg local (matrix A) + #if SA == 1 + GlobalToLocalA(agm, alm, kSizeM, tid, kwg); + #endif + // Loads data: off-chip --> local (matrix B) + #if SB == 1 + GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); + #endif + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + + // Loops over all workitem tiles, unrolled by a factor KWI + for (int pwi=0; pwi private (matrix A) + #if SA == 1 + LocalToPrivateA(alm, apm, kg); + // Loads data: off-chip --> private (matrix A) + #else + GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); + #endif + + // Loads data: local --> private (matrix B) + #if SB == 1 + LocalToPrivateB(blm, bpm, kg); + // Loads data: off-chip --> private (matrix B) + #else + GlobalToPrivateB(bgm, bpm, kSizeN, idk); + #endif + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulate(cpm, apm, bpm); + } + } + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + } + #if GLOBAL_MEM_FENCE == 1 + barrier(CLK_GLOBAL_MEM_FENCE); + #endif +} + +// ================================================================================================= +// The upper-triangular and lower-triangular kernels are only used in special cases +#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) + +// Main entry point of the kernel. This is the upper-triangular version. 
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmUpper(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Skip these threads if they do not contain threads contributing to the upper-triangle + if (GetGroupID1()*NWG < GetGroupID0()*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// Main entry point of the kernel. This is the lower-triangular version. +__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +void XgemmLower(const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Skip these threads if they do not contain threads contributing to the lower-triangle + if (GetGroupID1()*NWG > GetGroupID0()*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// ================================================================================================= +// If not using a triangular version, include the regular kernel +#else + +// Main entry point of the kernel. This is the regular full version. 
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+           const real_arg arg_alpha,
+           const real_arg arg_beta,
+           const __global realM* restrict agm,
+           const __global realN* restrict bgm,
+           __global realM* cgm) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
+
+  // Allocates workgroup-private memory (local memory)
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index fce59622..0b8e768f 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -34,6 +34,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/convert_hermitian.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp
index 1ba6080f..ba770065 100644
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@@ -31,6 +31,7 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 0fa1b7b1..3063f3bc 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -31,6 +31,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp
index 5a90a5a2..158cd9e5 100644
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -31,6 +31,7 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index 46b96b76..e1a72ef6 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -31,6 +31,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 7c9ac76a..4cb7fd00 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -35,6 +35,7 @@ class TuneXgemm {
       #include "../src/kernels/common.opencl"
       #include "../src/kernels/level3/xgemm_part1.opencl"
       #include "../src/kernels/level3/xgemm_part2.opencl"
+      #include "../src/kernels/level3/xgemm_part3.opencl"
     ;
   }
-- cgit v1.2.3
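The #include lines in the commit above work because every .opencl kernel file opens with R"( and ends with )": the C++ pre-processor pastes the kernel text in as adjacent raw string literals, which the compiler then concatenates into a single source string, while the files themselves stay plain OpenCL for syntax highlighting. A minimal self-contained sketch of this mechanism (the kernel and variable names below are illustrative only, not taken from CLBlast):

    // build: g++ -std=c++11 raw_string_concat.cpp
    #include <iostream>
    #include <string>

    // Adjacent string literals are concatenated at compile time; each #include of a
    // file wrapped in R"( ... )" therefore appends that file's text verbatim.
    const std::string program_source =
      "// header placed in front of the concatenated kernel sources\n"
      R"(
      __kernel void example_kernel_a() { }
      )"
      R"(
      __kernel void example_kernel_b() { }
      )"
    ;

    int main() {
      std::cout << program_source << std::endl;
      return 0;
    }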
From 9095537a6a5199e2dc3824381481af790d4e8f27 Mon Sep 17 00:00:00 2001
From: Ivan Shapovalov
Date: Tue, 13 Sep 2016 16:12:30 +0300
Subject: CMakeLists.txt: use -Wno-ignored-attributes to silence unfixable warnings

---
 CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7393c6e7..c10f66a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,6 +75,12 @@ else()
     if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9.0)
       set(FLAGS "${FLAGS} -Wno-attributes -Wno-unused-variable")
     endif()
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0.0)
+      # GCC does not support attributes on template arguments
+      # in particular we hit this with the alignment attributes on cl_XXX types
+      # which are then used to instantiate various templates in CLBlast
+      set(FLAGS "${FLAGS} -Wno-ignored-attributes")
+    endif()
   elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang)
     set(FLAGS "${FLAGS} -Wextra -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-padded")
     set(FLAGS "${FLAGS} -Wno-missing-prototypes -Wno-float-equal -Wno-switch-enum -Wno-switch")
-- cgit v1.2.3
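For context on the flag added above: GCC 6 and newer drop attributes such as alignment from types used as template arguments and emit -Wignored-attributes when doing so; according to the commit message, CLBlast hits this through the aligned cl_XXX types from the OpenCL headers. A hypothetical stand-alone reproducer of the same class of warning (not taken from the CLBlast sources):

    // build (illustrative): g++-6 -std=c++11 -c ignored_attributes_example.cpp
    #include <vector>

    // A typedef carrying an alignment attribute, similar in spirit to the cl_XXX types
    // mentioned in the commit message above.
    typedef double aligned_double __attribute__((aligned(16)));

    // GCC 6+ warns here that the attribute is ignored on the template argument,
    // unless -Wno-ignored-attributes is passed.
    std::vector<aligned_double> buffer(16);

    int main() {
      return static_cast<int>(buffer.size());
    }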
From 48ab0428cb3461676072a54571c747e83ef13772 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Tue, 13 Sep 2016 19:08:49 +0200
Subject: Renamed the DEFAULT_DEVICE and DEFAULT_PLATFORM env variables to be in line with recent usages of CLBLAST_DEVICE and CLBLAST_PLATFORM

---
 CMakeLists.txt | 10 +++++-----
 README.md      |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c10f66a3..ffcdf1a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,13 +197,13 @@ install(EXPORT CLBlast DESTINATION lib/cmake/CLBLast FILE CLBlastConfig.cmake)
 # ==================================================================================================
 
-# Sets a default platform ($DEVICEPLATFORM) and device ($DEFAULT_DEVICE) to run tuners and tests on
+# Sets a default platform ($DEVICEPLATFORM) and device ($CLBLAST_DEVICE) to run tuners and tests on
 set(DEVICEPLATFORM )
-if(DEFINED ENV{DEFAULT_DEVICE})
-  set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{DEFAULT_DEVICE})
+if(DEFINED ENV{CLBLAST_DEVICE})
+  set(DEVICEPLATFORM ${DEVICEPLATFORM} -device $ENV{CLBLAST_DEVICE})
 endif()
-if(DEFINED ENV{DEFAULT_PLATFORM})
-  set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{DEFAULT_PLATFORM})
+if(DEFINED ENV{CLBLAST_PLATFORM})
+  set(DEVICEPLATFORM ${DEVICEPLATFORM} -platform $ENV{CLBLAST_PLATFORM})
 endif()
 
 # ==================================================================================================
diff --git a/README.md b/README.md
index 1dd3ea65..51452028 100644
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ If your device is not (yet) among this list or if you want to tune CLBlast for s
 Note that CLBlast's tuners are based on the [CLTune auto-tuning library](https://github.com/CNugteren/CLTune), which has to be installed separately (requires version 2.3.1 or higher).
 
-Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake.
+Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels corresponds to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
 
 The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
 
@@ -168,7 +168,7 @@ To build these tests, another BLAS library is needed to serve as a reference. Th
 Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
 
-All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `DEFAULT_DEVICE` and `DEFAULT_PLATFORM` environmental variables before running CMake.
+All tests can be run directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environmental variables before running CMake.
 
 Compiling the performance tests/clients (optional)
-- cgit v1.2.3
From 4b94afda941a86f363064ff02f97e21eb9618794 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren
Date: Tue, 13 Sep 2016 19:20:39 +0200
Subject: Updated to version 0.9.0

---
 CHANGELOG      | 2 +-
 CMakeLists.txt | 2 +-
 README.md      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 10cde25d..1995dc84 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,5 @@
-Development version (next release)
+Version 0.9.0
 - Updated to version 6.0 of the CLCudaAPI C++11 OpenCL header
 - Improved performance significantly of rotated GEMV computations
 - Improved performance of unseen/un-tuned devices by a better default tuning parameter selection
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ffcdf1a9..178ac9bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,7 @@ set(CMAKE_USER_MAKE_RULES_OVERRIDE_CXX ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cxx_fla
 # CMake project details
 project("clblast" C CXX)
 set(clblast_VERSION_MAJOR 0)
-set(clblast_VERSION_MINOR 8)
+set(clblast_VERSION_MINOR 9)
 set(clblast_VERSION_PATCH 0)
 
 # Options and their default values
diff --git a/README.md b/README.md
index 51452028..b9631ea0 100644
--- a/README.md
+++ b/README.md
@@ -288,7 +288,7 @@ The contributing authors (code, pull requests, testing) so far are:
 * [Hugh Perkins](https://github.com/hughperkins)
 * [Gian-Carlo Pascutto](https://github.com/gcp)
 * [Ivan Shapovalov](https://github.com/intelfx)
-* [Dimitri VA](https://github.com/dvasschemacq)
+* [Dimitri Van Assche](https://github.com/dvasschemacq)
 
 Tuning and testing on a variety of OpenCL devices was made possible by:
-- cgit v1.2.3