67 files changed, 1831 insertions, 1062 deletions
diff --git a/src/cache.cpp b/src/cache.cpp
index cd9055d0..6080f082 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -23,6 +23,9 @@ namespace clblast {
 // Stores the compiled binary or IR in the cache
 void StoreBinaryToCache(const std::string &binary, const std::string &device_name,
                         const Precision &precision, const std::string &routine_name) {
+  #ifdef VERBOSE
+    printf("[DEBUG] Storing binary in cache\n");
+  #endif
   binary_cache_mutex_.lock();
   binary_cache_.push_back(BinaryCache{binary, device_name, precision, routine_name});
   binary_cache_mutex_.unlock();
@@ -31,8 +34,11 @@ void StoreBinaryToCache(const std::string &binary, const std::string &device_nam
 // Stores the compiled program in the cache
 void StoreProgramToCache(const Program &program, const Context &context,
                          const Precision &precision, const std::string &routine_name) {
+  #ifdef VERBOSE
+    printf("[DEBUG] Storing program in cache\n");
+  #endif
   program_cache_mutex_.lock();
-  program_cache_.push_back(ProgramCache{program, context.pointer(), precision, routine_name});
+  program_cache_.push_back(ProgramCache{program, context(), precision, routine_name});
   program_cache_mutex_.unlock();
 }
 
@@ -40,6 +46,9 @@ void StoreProgramToCache(const Program &program, const Context &context,
 // otherwise.
 const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision,
                                       const std::string &routine_name) {
+  #ifdef VERBOSE
+    printf("[DEBUG] Retrieving binary from cache\n");
+  #endif
   binary_cache_mutex_.lock();
   for (auto &cached_binary: binary_cache_) {
     if (cached_binary.MatchInCache(device_name, precision, routine_name)) {
@@ -55,9 +64,12 @@ const std::string& GetBinaryFromCache(const std::string &device_name, const Prec
 // otherwise.
 const Program& GetProgramFromCache(const Context &context, const Precision &precision,
                                    const std::string &routine_name) {
+  #ifdef VERBOSE
+    printf("[DEBUG] Retrieving program from cache\n");
+  #endif
   program_cache_mutex_.lock();
   for (auto &cached_program: program_cache_) {
-    if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
+    if (cached_program.MatchInCache(context(), precision, routine_name)) {
       program_cache_mutex_.unlock();
       return cached_program.program;
     }
@@ -85,7 +97,7 @@ bool ProgramIsInCache(const Context &context, const Precision &precision,
                       const std::string &routine_name) {
   program_cache_mutex_.lock();
   for (auto &cached_program: program_cache_) {
-    if (cached_program.MatchInCache(context.pointer(), precision, routine_name)) {
+    if (cached_program.MatchInCache(context(), precision, routine_name)) {
       program_cache_mutex_.unlock();
       return true;
     }
diff --git a/src/cache.hpp b/src/cache.hpp
index 0d74d7bc..9075da0d 100644
--- a/src/cache.hpp
+++ b/src/cache.hpp
@@ -48,14 +48,14 @@ static std::mutex binary_cache_mutex_;
 // The cache of compiled OpenCL programs, along with some meta-data
 struct ProgramCache {
   Program program;
-  ContextPointer context_ptr;
+  cl_context context;
   Precision precision;
   std::string routine_name_;
 
   // Finds out whether the properties match
-  bool MatchInCache(const ContextPointer ref_context, const Precision &ref_precision,
+  bool MatchInCache(const cl_context ref_context, const Precision &ref_precision,
                     const std::string &ref_routine) {
-    return (context_ptr == ref_context &&
+    return (context == ref_context &&
             precision == ref_precision &&
             routine_name_ == ref_routine);
   }
diff --git a/src/clblast.cpp b/src/clblast.cpp
index 88d60772..79c30ca4 100644
--- a/src/clblast.cpp
+++ b/src/clblast.cpp
@@ -16,7 +16,6 @@
 #include <string>
 
 #include "clblast.h"
-#include "public_api.hpp"
 #include "cache.hpp"
 
 // BLAS level-1 includes
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index b834d8b4..d57223dd 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -72,15 +72,24 @@ inline void CheckError(const cl_int status) {
 class Event {
  public:
 
-  // Constructor based on the regular OpenCL data-type
-  explicit Event(const cl_event event): event_(event) { }
+  // Wraps an existing OpenCL event without owning it: only the internal storage of the handle is
+  // freed when the last copy goes away, the event itself is never released by this object
+  explicit Event(const cl_event event):
+      event_(std::make_shared<cl_event>(event)) {
+  }
 
-  // Regular constructor
-  explicit Event(): event_(nullptr) { }
+  // Regular constructor with memory management: the last copy to go away releases the event
+  explicit Event():
+      event_(new cl_event, [](cl_event* e) {
+        if (*e) { CheckError(clReleaseEvent(*e)); } // Still null if no event was ever stored
+        delete e;
+      }) {
+    *event_ = nullptr; // No event yet: it is filled in later through operator() or pointer()
+  }
 
   // Waits for completion of this event
   void WaitForCompletion() const {
-    CheckError(clWaitForEvents(1, &event_));
+    CheckError(clWaitForEvents(1, &(*event_)));
   }
 
   // Retrieves the elapsed time of the last recorded event. Note that no error checking is done on
@@ -89,20 +98,22 @@ class Event {
   float GetElapsedTime() const {
     WaitForCompletion();
     auto bytes = size_t{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
+    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
     auto time_start = size_t{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
+    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
+    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
     auto time_end = size_t{0};
-    clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
+    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
     return (time_end - time_start) * 1.0e-6f;
   }
 
   // Accessor to the private data-member
-  cl_event& operator()() { return event_; }
-  cl_event* pointer() { return &event_; }
+  cl_event& operator()() { return *event_; }
+  const cl_event& operator()() const { return *event_; }
+  cl_event* pointer() { return &(*event_); }
+  const cl_event* pointer() const { return &(*event_); }
  private:
-  cl_event event_;
+  std::shared_ptr<cl_event> event_;
 };
 
 // Pointer to an OpenCL event
@@ -163,6 +174,15 @@ class Device {
 
   // Methods to retrieve device information
   std::string Version() const { return GetInfoString(CL_DEVICE_VERSION); }
+  size_t VersionNumber() const { // "OpenCL <major>.<minor> <vendor info>" -> 100*major + 10*minor
+    const auto text = Version(); // Parsed with integers only: locale-independent, never throws
+    const auto is_digit = [&text](const size_t i) { return i < text.size() && text[i] >= '0' && text[i] <= '9'; };
+    auto pos = text.find_first_of("0123456789"); // npos when there are no digits: the result is 0
+    auto major = size_t{0};
+    for (; is_digit(pos); ++pos) { major = 10 * major + static_cast<size_t>(text[pos] - '0'); }
+    const auto has_minor = (pos < text.size() && text[pos] == '.' && is_digit(pos + 1));
+    return 100 * major + (has_minor ? 10 * static_cast<size_t>(text[pos + 1] - '0') : 0);
+  }
   std::string Vendor() const { return GetInfoString(CL_DEVICE_VENDOR); }
   std::string Name() const { return GetInfoString(CL_DEVICE_NAME); }
   std::string Type() const {
@@ -176,24 +196,32 @@ class Device {
   }
   size_t MaxWorkGroupSize() const { return GetInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); }
   size_t MaxWorkItemDimensions() const {
-    return GetInfo(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS));
   }
   std::vector<size_t> MaxWorkItemSizes() const {
     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
   }
-  size_t LocalMemSize() const {
-    return static_cast<size_t>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
+  cl_ulong LocalMemSize() const {
+    return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
   }
   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
-  size_t CoreClock() const { return GetInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY); }
-  size_t ComputeUnits() const { return GetInfo(CL_DEVICE_MAX_COMPUTE_UNITS); }
-  size_t MemorySize() const { return GetInfo(CL_DEVICE_GLOBAL_MEM_SIZE); }
-  size_t MaxAllocSize() const { return GetInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE); }
+  size_t CoreClock() const {
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_CLOCK_FREQUENCY));
+  }
+  size_t ComputeUnits() const {
+    return static_cast<size_t>(GetInfo<cl_uint>(CL_DEVICE_MAX_COMPUTE_UNITS));
+  }
+  unsigned long MemorySize() const { // NOTE(review): cast narrows if unsigned long is 32-bit
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_GLOBAL_MEM_SIZE));
+  }
+  unsigned long MaxAllocSize() const { // NOTE(review): cast narrows if unsigned long is 32-bit
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_MAX_MEM_ALLOC_SIZE));
+  }
   size_t MemoryClock() const { return 0; } // Not exposed in OpenCL
   size_t MemoryBusWidth() const { return 0; } // Not exposed in OpenCL
 
   // Configuration-validity checks
-  bool IsLocalMemoryValid(const size_t local_mem_usage) const {
+  bool IsLocalMemoryValid(const cl_ulong local_mem_usage) const {
     return (local_mem_usage <= LocalMemSize());
   }
   bool IsThreadConfigValid(const std::vector<size_t> &local) const {
@@ -211,6 +239,8 @@ class Device {
   bool IsCPU() const { return Type() == "CPU"; }
   bool IsGPU() const { return Type() == "GPU"; }
   bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+  bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
+  bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel" || Vendor() == "Intel(R) Corporation"; }
   bool IsARM() const { return Vendor() == "ARM"; }
 
   // Accessor to the private data-member
@@ -227,13 +257,6 @@ class Device {
     CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
     return result;
   }
-  size_t GetInfo(const cl_device_info info) const {
-    auto bytes = size_t{0};
-    CheckError(clGetDeviceInfo(device_, info, 0, nullptr, &bytes));
-    auto result = cl_uint(0);
-    CheckError(clGetDeviceInfo(device_, info, bytes, &result, nullptr));
-    return static_cast<size_t>(result);
-  }
   template <typename T>
   std::vector<T> GetInfoVector(const cl_device_info info) const {
     auto bytes = size_t{0};
@@ -386,8 +409,16 @@ class Queue {
                                                              delete s; }) {
     auto status = CL_SUCCESS;
     #ifdef CL_VERSION_2_0
-      cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
-      *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
+      size_t ocl_version = device.VersionNumber();
+      if (ocl_version >= 200)
+      {
+        cl_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
+        *queue_ = clCreateCommandQueueWithProperties(context(), device(), properties, &status);
+      }
+      else
+      {
+        *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
+      }
     #else
       *queue_ = clCreateCommandQueue(context(), device(), CL_QUEUE_PROFILING_ENABLE, &status);
     #endif
@@ -627,15 +658,25 @@ class Kernel {
   }
 
   // Retrieves the amount of local memory used per work-group for this kernel
-  size_t LocalMemUsage(const Device &device) const {
+  cl_ulong LocalMemUsage(const Device &device) const {
     auto bytes = size_t{0};
     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
-    auto result = size_t{0};
+    auto result = cl_ulong{0};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
     return result;
   }
 
+  // Retrieves the name of the kernel function; const since it only queries the kernel object
+  std::string GetFunctionName() const {
+    auto bytes = size_t{0};
+    CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
+    auto result = std::string{};
+    result.resize(bytes);
+    CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, bytes, &result[0], nullptr));
+    return std::string{result.c_str()}; // Removes any trailing '\0'-characters
+  }
+
   // Launches a kernel onto the specified queue
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event) {
@@ -647,30 +688,21 @@ class Kernel {
   // As above, but with an event waiting list
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
-              std::vector<Event>& waitForEvents) {
-    if (waitForEvents.size() == 0) { return Launch(queue, global, local, event); }
-
+              const std::vector<Event> &waitForEvents) {
     // Builds a plain version of the events waiting list
     auto waitForEventsPlain = std::vector<cl_event>();
     for (auto &waitEvent : waitForEvents) {
-      waitForEventsPlain.push_back(waitEvent());
+      if (waitEvent()) { waitForEventsPlain.push_back(waitEvent()); }
     }
 
     // Launches the kernel while waiting for other events
     CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), local.data(),
+                                      nullptr, global.data(), !local.empty() ? local.data() : nullptr,
                                       static_cast<cl_uint>(waitForEventsPlain.size()),
-                                      waitForEventsPlain.data(),
+                                      !waitForEventsPlain.empty() ? waitForEventsPlain.data() : nullptr,
                                       event));
   }
 
-  // As above, but with the default local workgroup size
-  void Launch(const Queue &queue, const std::vector<size_t> &global, EventPointer event) {
-    CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()),
-                                      nullptr, global.data(), nullptr,
-                                      0, nullptr, event));
-  }
-
   // Accessor to the private data-member
   const cl_kernel& operator()() const { return *kernel_; }
  private:
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 6ec93731..34c44a29 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -17,6 +17,8 @@
 #include "database/kernels/xaxpy.hpp"
 #include "database/kernels/xdot.hpp"
 #include "database/kernels/xgemv.hpp"
+#include "database/kernels/xgemv_fast.hpp"
+#include "database/kernels/xgemv_fast_rot.hpp"
 #include "database/kernels/xger.hpp"
 #include "database/kernels/xgemm.hpp"
 #include "database/kernels/copy.hpp"
@@ -32,6 +34,8 @@ const std::vector<Database::DatabaseEntry> Database::database = {
   XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
   XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
   XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
+  XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble,
+  XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
   XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
   XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
   CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
@@ -42,9 +46,10 @@ const std::vector<Database::DatabaseEntry> Database::database = {
 
 // =================================================================================================
 
-// Constructor, computing device properties and populating the parameter-vector from the database
+// Constructor, computing device properties and populating the parameter-vector from the database.
+// This takes an optional overlay database in case of custom tuning or custom kernels.
 Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
-                   const Precision precision):
+                   const Precision precision, const std::vector<DatabaseEntry> &overlay):
   parameters_{} {
 
   // Finds information of the current device
@@ -53,10 +58,26 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
   auto device_vendor = device.Vendor();
   auto device_name = device.Name();
 
+  // Set the short vendor name
+  for (auto &combination : kVendorNames) {
+    if (device_vendor == combination.first) {
+      device_vendor = combination.second;
+    }
+  }
+
   // Iterates over all kernels to include, and retrieves the parameters for each of them
   for (auto &kernel: kernels) {
-    auto search_result = Search(kernel, device_type, device_vendor, device_name, precision);
-    parameters_.insert(search_result.begin(), search_result.end());
+    auto search_result = ParametersPtr{};
+
+    for (auto db: { &overlay, &database }) {
+      search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db);
+      if (search_result) {
+        parameters_.insert(search_result->begin(), search_result->end());
+        break;
+      }
+    }
+
+    if (!search_result) { throw std::runtime_error("Database error, could not find a suitable entry"); }
   }
 }
 
@@ -73,28 +94,22 @@ std::string Database::GetDefines() const {
 
 // =================================================================================================
 
-// Searches the database for the right kernel and precision
-Database::Parameters Database::Search(const std::string &this_kernel,
-                                      const std::string &this_type,
-                                      const std::string &this_vendor,
-                                      const std::string &this_device,
-                                      const Precision this_precision) const {
-  // Set the short vendor name
-  auto this_short_vendor = this_vendor;
-  for (auto &combination : kVendorNames) {
-    if (this_vendor == combination.first) {
-      this_short_vendor = combination.second;
-    }
-  }
+// Searches a particular database for the right kernel and precision
+Database::ParametersPtr Database::Search(const std::string &this_kernel,
+                                         const std::string &this_type,
+                                         const std::string &this_vendor,
+                                         const std::string &this_device,
+                                         const Precision this_precision,
+                                         const std::vector<DatabaseEntry> &this_database) const {
 
   // Selects the right kernel
-  for (auto &db: database) {
+  for (auto &db: this_database) {
     if (db.kernel == this_kernel && db.precision == this_precision) {
 
       // Searches for the right vendor and device type, or selects the default if unavailable. This
       // assumes that the default vendor / device type is last in the database.
       for (auto &vendor: db.vendors) {
-        if ((vendor.name == this_short_vendor || vendor.name == kDeviceVendorAll) &&
+        if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
             (vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
 
           // Searches for the right device. If the current device is unavailable, selects the vendor
@@ -104,7 +119,7 @@ Database::Parameters Database::Search(const std::string &this_kernel,
             if (device.name == this_device || device.name == "default") {
 
               // Sets the parameters accordingly
-              return device.parameters;
+              return &device.parameters;
             }
           }
         }
@@ -112,8 +127,8 @@ Database::Parameters Database::Search(const std::string &this_kernel,
     }
   }
 
-  // If we reached this point, something is wrong
-  throw std::runtime_error("Database error, could not find a suitable entry");
+  // If we reached this point, the entry was not found in this database
+  return nullptr;
 }
 
 // =================================================================================================
diff --git a/src/database/database.hpp b/src/database/database.hpp
index 0987cbed..a6ab49c5 100644
--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@@ -32,6 +32,7 @@ class Database {
 
   // Type alias for the database parameters
   using Parameters = std::unordered_map<std::string,size_t>;
+  using ParametersPtr = const Parameters*;
 
   // Structures for content inside the database
   struct DatabaseDevice {
@@ -70,6 +71,8 @@ class Database {
   static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
   static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
   static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
+  static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble;
+  static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
   static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
   static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
   static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
@@ -78,9 +81,9 @@ class Database {
   static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
   static const std::vector<DatabaseEntry> database;
 
-  // The constructor
+  // The constructor with a user-provided database overlay (potentially an empty vector)
   explicit Database(const Queue &queue, const std::vector<std::string> &routines,
-                    const Precision precision);
+                    const Precision precision, const std::vector<DatabaseEntry> &overlay);
 
   // Accessor of values by key
   size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
@@ -89,9 +92,10 @@ class Database {
   std::string GetDefines() const;
 
  private:
-  Parameters Search(const std::string &this_kernel, const std::string &this_type,
-                    const std::string &this_vendor, const std::string &this_device,
-                    const Precision this_precision) const;
+  // Searches the specified database, returning a pointer to the parameters (or a nullptr)
+  ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
+                       const std::string &this_vendor, const std::string &this_device,
+                       const Precision this_precision, const std::vector<DatabaseEntry> &db) const;
 
   // Found parameters suitable for this device/kernel
   Parameters parameters_;
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index 14946af4..a6b7dfe8 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::CopyHalf = {
   "Copy", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
       }
@@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::CopySingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "Hawaii",                                          { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
+        { "Oland",                                           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
         { "Pitcairn",                                        { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "Tahiti",                                          { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
       }
     },
     { // ARM GPUs
@@ -59,11 +61,13 @@ const Database::DatabaseEntry Database::CopySingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics 530",                        { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
       }
     },
     { // Intel accelerators
@@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::CopySingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 480",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
-        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
         { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",2} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
       }
     },
   }
@@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Hawaii",                                          { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+        { "Oland",                                           { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Pitcairn",                                        { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
       }
     },
     { // Intel CPUs
@@ -112,16 +120,18 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
-        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
       }
     },
     { // Intel accelerators
@@ -133,18 +143,21 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
         { "Tesla K40m",                                      { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
       }
     },
   }
@@ -158,9 +171,10 @@ const Database::DatabaseEntry Database::CopyDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Hawaii",                                          { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
+        { "Oland",                                           { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
         { "Pitcairn",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
       }
     },
     { // ARM GPUs
@@ -174,7 +188,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
-        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",8}, {"COPY_WPT",1} } },
       }
     },
     { // Intel accelerators
@@ -186,20 +200,23 @@ const Database::DatabaseEntry Database::CopyDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 480",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
       }
     },
   }
@@ -213,9 +230,10 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Hawaii",                                          { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",8} } },
+        { "Oland",                                           { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Pitcairn",                                        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tahiti",                                          { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -229,7 +247,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",8}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",8}, {"COPY_WPT",1} } },
       }
     },
     { // Intel accelerators
@@ -241,8 +259,11 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",4} } },
         { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@@ -254,7 +275,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
       }
     },
   }
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index db4df9f0..3cfabaf4 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadHalf = {
   "Pad", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
       }
@@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::PadSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Hawaii",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+        { "Oland",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Pitcairn",                                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Tahiti",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
       }
     },
     { // ARM GPUs
@@ -54,16 +56,18 @@ const Database::DatabaseEntry Database::PadSingle = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",4} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Intel(R) HD Graphics 530",                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris Pro",                                        { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
       }
     },
     { // Intel accelerators
@@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::PadSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 1070",                                { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 480",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+        { "GeForce GTX 670",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
         { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Tesla K40m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
       }
     },
   }
@@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Hawaii",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Oland",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Pitcairn",                                        { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Tahiti",                                          { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // ARM GPUs
@@ -118,16 +126,18 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris Pro",                                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
       }
     },
     { // Intel accelerators
@@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 1070",                                { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 670",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
   }
@@ -166,9 +179,10 @@ const Database::DatabaseEntry Database::PadDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Hawaii",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Oland",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Pitcairn",                                        { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Tahiti",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
       }
     },
     { // ARM GPUs
@@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::PadDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
       }
     },
     { // Intel accelerators
@@ -194,20 +208,23 @@ const Database::DatabaseEntry Database::PadDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 1070",                                { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 670",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
   }
@@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Hawaii",                                          { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Oland",                                           { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Pitcairn",                                        { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tahiti",                                          { {"PAD_DIMX",8}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // ARM GPUs
@@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
       }
     },
     { // Intel accelerators
@@ -249,20 +267,23 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 1070",                                { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
         { "GeForce GTX 480",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 670",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tesla K20m",                                      { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Tesla K40m",                                      { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
   }
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index 7fedd15a..88bd4ea7 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = {
   "Padtranspose", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
         { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
       }
@@ -38,6 +39,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Hawaii",                                          { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
+        { "Oland",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Pitcairn",                                        { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
@@ -54,11 +56,13 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PADTRA_PAD",0}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris",                                            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
+        { "GeForce GTX 1070",                                { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "GeForce GTX 670",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "GeForce GTX 750",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "Tesla K20m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
       }
     },
   }
@@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Hawaii",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Oland",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
         { "Pitcairn",                                        { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
         { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
       }
     },
     { // ARM GPUs
@@ -123,11 +131,13 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Iris",                                            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
       }
     },
     { // Intel accelerators
@@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "Tesla K20m",                                      { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
       }
     },
   }
@@ -166,9 +179,10 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Hawaii",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Oland",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Pitcairn",                                        { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
         { "Tahiti",                                          { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
       }
     },
     { // ARM GPUs
@@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",8} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
       }
     },
     { // Intel accelerators
@@ -194,20 +208,23 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "Tesla K20m",                                      { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
       }
     },
   }
@@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
         { "Hawaii",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
+        { "Oland",                                           { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
         { "Pitcairn",                                        { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
         { "Tahiti",                                          { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
       }
     },
     { // ARM GPUs
@@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
-        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",4} } },
       }
     },
     { // Intel accelerators
@@ -249,20 +267,23 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "Tesla K20m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "Tesla K40m",                                      { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",1} } },
+        { "default",                                         { {"PADTRA_PAD",0}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
       }
     },
   }
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 4229e39f..0e1b608e 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::TransposeHalf = {
   "Transpose", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
       }
@@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::TransposeSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
         { "Hawaii",                                          { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",8} } },
+        { "Oland",                                           { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Pitcairn",                                        { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "Tahiti",                                          { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
       }
     },
     { // ARM GPUs
@@ -59,11 +61,13 @@ const Database::DatabaseEntry Database::TransposeSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "Intel(R) HD Graphics 530",                        { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
-        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
       }
     },
     { // Intel accelerators
@@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::TransposeSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "GeForce GTX 1070",                                { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "GeForce GTX 480",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "GeForce GTX 670",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "GeForce GTX 750",                                 { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
         { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "GeForce GTX TITAN X",                             { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Tesla K20m",                                      { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Tesla K40m",                                      { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
-        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
       }
     },
   }
@@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Hawaii",                                          { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "Oland",                                           { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Pitcairn",                                        { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "Tahiti",                                          { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
       }
     },
     { // ARM GPUs
@@ -118,35 +126,40 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
-        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
-        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
       }
     },
   }
@@ -160,9 +173,10 @@ const Database::DatabaseEntry Database::TransposeDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Hawaii",                                          { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "Oland",                                           { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Pitcairn",                                        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Tahiti",                                          { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
       }
     },
     { // ARM GPUs
@@ -176,7 +190,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",8} } },
       }
     },
     { // Intel accelerators
@@ -188,20 +202,23 @@ const Database::DatabaseEntry Database::TransposeDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "GeForce GTX 1070",                                { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 480",                                 { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "GeForce GTX 670",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "GeForce GTX 750",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
-        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
       }
     },
   }
@@ -215,9 +232,10 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Hawaii",                                          { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
+        { "Oland",                                           { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "Pitcairn",                                        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "Tahiti",                                          { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -231,26 +249,29 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"TRA_DIM",4}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "GeForce GTX 1070",                                { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 480",                                 { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "GeForce GTX 670",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
+        { "GeForce GTX 750",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX 980",                                 { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "Tesla K20m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
         { "Tesla K40m",                                      { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
-        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"TRA_DIM",4}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",1} } },
       }
     },
   }
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index d8088ca2..9c1bcd99 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -18,13 +18,14 @@ const Database::DatabaseEntry Database::XaxpyHalf = {
   "Xaxpy", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",4}, {"WGS",512}, {"WPT",8} } },
-        { "default",                                         { {"VW",4}, {"WGS",512}, {"WPT",8} } },
+        { "default",                                         { {"VW",8}, {"WGS",64}, {"WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"VW",4}, {"WGS",512}, {"WPT",8} } },
+        { "default",                                         { {"VW",8}, {"WGS",64}, {"WPT",1} } },
       }
     },
   }
@@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::XaxpySingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Hawaii",                                          { {"VW",2}, {"WGS",64}, {"WPT",2} } },
+        { "Oland",                                           { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Pitcairn",                                        { {"VW",2}, {"WGS",128}, {"WPT",1} } },
         { "Tahiti",                                          { {"VW",2}, {"WGS",64}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",2}, {"WGS",256}, {"WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -54,12 +56,14 @@ const Database::DatabaseEntry Database::XaxpySingle = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",1}, {"WGS",512}, {"WPT",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",4}, {"WGS",256}, {"WPT",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "default",                                         { {"VW",2}, {"WGS",256}, {"WPT",1} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",8}, {"WGS",256}, {"WPT",1} } },
+        { "Intel(R) HD Graphics 530",                        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",1}, {"WGS",512}, {"WPT",2} } },
         { "Iris",                                            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",128}, {"WPT",2} } },
@@ -75,20 +79,23 @@ const Database::DatabaseEntry Database::XaxpySingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"VW",2}, {"WGS",64}, {"WPT",1} } },
-        { "GeForce GTX 480",                                 { {"VW",4}, {"WGS",64}, {"WPT",1} } },
-        { "GeForce GTX 680",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 1070",                                { {"VW",1}, {"WGS",64}, {"WPT",4} } },
+        { "GeForce GTX 480",                                 { {"VW",2}, {"WGS",128}, {"WPT",1} } },
+        { "GeForce GTX 670",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",4}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Tesla K20m",                                      { {"VW",4}, {"WGS",128}, {"WPT",1} } },
         { "Tesla K40m",                                      { {"VW",4}, {"WGS",128}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",4}, {"WGS",64}, {"WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",4}, {"WGS",64}, {"WPT",1} } },
       }
     },
   }
@@ -102,9 +109,10 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"VW",2}, {"WGS",64}, {"WPT",8} } },
         { "Hawaii",                                          { {"VW",1}, {"WGS",128}, {"WPT",2} } },
+        { "Oland",                                           { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Pitcairn",                                        { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Tahiti",                                          { {"VW",1}, {"WGS",64}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -118,16 +126,18 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",4}, {"WGS",256}, {"WPT",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",1}, {"WGS",1024}, {"WPT",2} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+        { "default",                                         { {"VW",8}, {"WGS",1024}, {"WPT",1} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"VW",4}, {"WGS",64}, {"WPT",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",2}, {"WGS",512}, {"WPT",1} } },
         { "Iris",                                            { {"VW",2}, {"WGS",128}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",256}, {"WPT",8} } },
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",256}, {"WPT",2} } },
       }
     },
     { // Intel accelerators
@@ -139,20 +149,23 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"VW",1}, {"WGS",512}, {"WPT",1} } },
+        { "GeForce GTX 1070",                                { {"VW",1}, {"WGS",64}, {"WPT",2} } },
         { "GeForce GTX 480",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+        { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
+        { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",512}, {"WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",512}, {"WPT",1} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
         { "Tesla K20m",                                      { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Tesla K40m",                                      { {"VW",1}, {"WGS",128}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",256}, {"WPT",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
       }
     },
   }
@@ -166,6 +179,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "Hawaii",                                          { {"VW",1}, {"WGS",64}, {"WPT",2} } },
+        { "Oland",                                           { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Pitcairn",                                        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Tahiti",                                          { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -182,7 +196,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",8}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",8}, {"WGS",2048}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",8}, {"WGS",512}, {"WPT",1} } },
       }
     },
     { // Intel accelerators
@@ -194,15 +208,18 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"VW",1}, {"WGS",64}, {"WPT",1} } },
-        { "GeForce GTX 480",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 1070",                                { {"VW",1}, {"WGS",64}, {"WPT",8} } },
+        { "GeForce GTX 480",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
         { "Tesla K20m",                                      { {"VW",2}, {"WGS",128}, {"WPT",1} } },
         { "Tesla K40m",                                      { {"VW",2}, {"WGS",128}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
       }
     },
     { // Default
@@ -221,9 +238,10 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Hawaii",                                          { {"VW",2}, {"WGS",64}, {"WPT",1} } },
+        { "Oland",                                           { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "Pitcairn",                                        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Tahiti",                                          { {"VW",1}, {"WGS",128}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -237,7 +255,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW",8}, {"WGS",128}, {"WPT",1} } },
         { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW",8}, {"WGS",512}, {"WPT",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW",1}, {"WGS",256}, {"WPT",1} } },
-        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "default",                                         { {"VW",4}, {"WGS",1024}, {"WPT",1} } },
       }
     },
     { // Intel accelerators
@@ -249,8 +267,11 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 1070",                                { {"VW",1}, {"WGS",64}, {"WPT",2} } },
         { "GeForce GTX 480",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
+        { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",64}, {"WPT",4} } },
@@ -262,7 +283,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "default",                                         { {"VW",1}, {"WGS",128}, {"WPT",1} } },
       }
     },
   }
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index 48288f95..987a990d 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XdotHalf = {
   "Xdot", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",32}, {"WGS2",32} } },
         { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
       }
@@ -37,7 +38,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",128}, {"WGS2",32} } },
-        { "Hawaii",                                          { {"WGS1",256}, {"WGS2",32} } },
+        { "Oland",                                           { {"WGS1",256}, {"WGS2",32} } },
         { "Pitcairn",                                        { {"WGS1",128}, {"WGS2",32} } },
         { "Tahiti",                                          { {"WGS1",128}, {"WGS2",32} } },
         { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
@@ -51,26 +52,31 @@ const Database::DatabaseEntry Database::XdotSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics 530",                        { {"WGS1",64}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",512}, {"WGS2",64} } },
-        { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"WGS1",128}, {"WGS2",32} } },
+        { "GeForce GTX 1070",                                { {"WGS1",128}, {"WGS2",1024} } },
         { "GeForce GTX 480",                                 { {"WGS1",512}, {"WGS2",32} } },
+        { "GeForce GTX 670",                                 { {"WGS1",512}, {"WGS2",1024} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",128} } },
+        { "GeForce GTX 750",                                 { {"WGS1",128}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",1024}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",256} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",32} } },
       }
     },
   }
@@ -83,10 +89,10 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WGS2",32} } },
-        { "Hawaii",                                          { {"WGS1",256}, {"WGS2",32} } },
+        { "Oland",                                           { {"WGS1",128}, {"WGS2",32} } },
         { "Pitcairn",                                        { {"WGS1",256}, {"WGS2",32} } },
         { "Tahiti",                                          { {"WGS1",64}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
       }
     },
     { // Intel CPUs
@@ -97,6 +103,8 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"WGS1",256}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",32}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",32}, {"WGS2",32} } },
@@ -106,17 +114,20 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"WGS1",64}, {"WGS2",32} } },
+        { "GeForce GTX 1070",                                { {"WGS1",128}, {"WGS2",32} } },
         { "GeForce GTX 480",                                 { {"WGS1",512}, {"WGS2",32} } },
+        { "GeForce GTX 670",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
+        { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",512}, {"WGS2",64} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",32} } },
       }
     },
   }
@@ -129,10 +140,10 @@ const Database::DatabaseEntry Database::XdotDouble = {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WGS2",128} } },
-        { "Hawaii",                                          { {"WGS1",256}, {"WGS2",32} } },
+        { "Oland",                                           { {"WGS1",256}, {"WGS2",32} } },
         { "Pitcairn",                                        { {"WGS1",128}, {"WGS2",32} } },
         { "Tahiti",                                          { {"WGS1",256}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
       }
     },
     { // Intel CPUs
@@ -144,17 +155,20 @@ const Database::DatabaseEntry Database::XdotDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"WGS1",128}, {"WGS2",32} } },
+        { "GeForce GTX 1070",                                { {"WGS1",128}, {"WGS2",512} } },
         { "GeForce GTX 480",                                 { {"WGS1",512}, {"WGS2",32} } },
+        { "GeForce GTX 670",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
+        { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",256} } },
         { "GeForce GTX 980",                                 { {"WGS1",128}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",128}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",64} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",64} } },
       }
     },
   }
@@ -167,10 +181,10 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WGS2",32} } },
-        { "Hawaii",                                          { {"WGS1",256}, {"WGS2",32} } },
+        { "Oland",                                           { {"WGS1",256}, {"WGS2",32} } },
         { "Pitcairn",                                        { {"WGS1",256}, {"WGS2",32} } },
         { "Tahiti",                                          { {"WGS1",256}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",32} } },
       }
     },
     { // Intel CPUs
@@ -182,17 +196,20 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"WGS1",64}, {"WGS2",32} } },
+        { "GeForce GTX 1070",                                { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 480",                                 { {"WGS1",512}, {"WGS2",32} } },
+        { "GeForce GTX 670",                                 { {"WGS1",512}, {"WGS2",128} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WGS2",64} } },
+        { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",128}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",128}, {"WGS2",32} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",64} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",64} } },
       }
     },
   }
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index 27cebc8a..d19c55b5 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -18,7 +18,7 @@ const Database::DatabaseEntry Database::XgemmHalf = {
   "Xgemm", Precision::kHalf, {
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
   }
@@ -32,6 +32,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",8} } },
         { "Hawaii",                                          { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
+        { "Oland",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "Pitcairn",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Tahiti",                                          { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
         { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -53,11 +54,13 @@ const Database::DatabaseEntry Database::XgemmSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
         { "Iris",                                            { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
         { "Iris Pro",                                        { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
     { // Intel accelerators
@@ -69,20 +72,23 @@ const Database::DatabaseEntry Database::XgemmSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+        { "GeForce GTX 1070",                                { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",1} } },
         { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+        { "GeForce GTX 670",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
+        { "GeForce GTX 750",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
         { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
         { "Tesla K20m",                                      { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "Tesla K40m",                                      { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
   }
@@ -96,6 +102,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
         { "Hawaii",                                          { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "Oland",                                           { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
         { "Pitcairn",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
         { "Tahiti",                                          { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
         { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -117,11 +124,13 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
         { "Iris",                                            { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
     { // Intel accelerators
@@ -133,8 +142,11 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+        { "GeForce GTX 1070",                                { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+        { "GeForce GTX 670",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
+        { "GeForce GTX 750",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
         { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -158,11 +170,12 @@ const Database::DatabaseEntry Database::XgemmDouble = {
   "Xgemm", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
-        { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "Hawaii",                                          { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+        { "Oland",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
         { "Pitcairn",                                        { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
         { "Tahiti",                                          { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
-        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+        { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
     { // ARM GPUs
@@ -188,8 +201,11 @@ const Database::DatabaseEntry Database::XgemmDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
+        { "GeForce GTX 1070",                                { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",8} } },
         { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+        { "GeForce GTX 670",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+        { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
         { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
         { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
@@ -215,6 +231,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "Hawaii",                                          { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
+        { "Oland",                                           { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
         { "Pitcairn",                                        { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Tahiti",                                          { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -243,8 +260,11 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
+        { "GeForce GTX 1070",                                { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
         { "GeForce GTX 480",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "GeForce GTX 670",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
+        { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
         { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
         { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index ce258f2f..e5e8845e 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -18,13 +18,14 @@ const Database::DatabaseEntry Database::XgemvHalf = {
   "Xgemv", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",128}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
   }
@@ -36,52 +37,58 @@ const Database::DatabaseEntry Database::XgemvSingle = {
   "Xgemv", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
-        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Hawaii",                                          { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",128}, {"WPT1",1} } },
+        { "Hawaii",                                          { {"WGS1",128}, {"WPT1",1} } },
+        { "Oland",                                           { {"WGS1",128}, {"WPT1",1} } },
+        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1} } },
+        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",128}, {"WPT1",1} } },
       }
     },
     { // Intel CPUs
       kDeviceTypeCPU, "Intel", {
-        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",4} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",4} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",4}, {"WGS3",256}, {"WPT3",4} } },
-        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
-        { "Iris",                                            { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",8} } },
-        { "Iris Pro",                                        { {"WGS1",256}, {"WPT1",2}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
+        { "Intel(R) HD Graphics 530",                        { {"WGS1",256}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1} } },
+        { "Iris",                                            { {"WGS1",64}, {"WPT1",2} } },
+        { "Iris Pro",                                        { {"WGS1",256}, {"WPT1",2} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // Intel accelerators
       kDeviceTypeAccelerator, "Intel", {
-        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
-        { "GRID K520",                                       { {"WGS1",256}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
-        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
-        { "GeForce GTX 680",                                 { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
-        { "GeForce GTX 980",                                 { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
-        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
-        { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
-        { "Tesla K20m",                                      { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
-        { "Tesla K40m",                                      { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "GRID K520",                                       { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 1070",                                { {"WGS1",128}, {"WPT1",1} } },
+        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 670",                                 { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 680",                                 { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 750",                                 { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 980",                                 { {"WGS1",128}, {"WPT1",1} } },
+        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WPT1",1} } },
+        { "Tesla K20m",                                      { {"WGS1",128}, {"WPT1",1} } },
+        { "Tesla K40m",                                      { {"WGS1",256}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",256}, {"WPT1",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
   }
@@ -93,48 +100,54 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
   "Xgemv", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
-        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",256}, {"WPT2",2}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
-        { "Hawaii",                                          { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Pitcairn",                                        { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
-        { "Tahiti",                                          { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1} } },
+        { "Hawaii",                                          { {"WGS1",64}, {"WPT1",1} } },
+        { "Oland",                                           { {"WGS1",64}, {"WPT1",1} } },
+        { "Pitcairn",                                        { {"WGS1",64}, {"WPT1",1} } },
+        { "Tahiti",                                          { {"WGS1",64}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // Intel CPUs
       kDeviceTypeCPU, "Intel", {
-        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",2}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",128}, {"WPT1",1} } },
+        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1}, {"VW2",2}, {"WGS2",128}, {"WPT2",2}, {"VW3",4}, {"WGS3",128}, {"WPT3",4} } },
-        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",4}, {"WGS3",64}, {"WPT3",4} } },
-        { "Iris",                                            { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Iris Pro",                                        { {"WGS1",64}, {"WPT1",1}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) HD Graphics 530",                        { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1} } },
+        { "Iris",                                            { {"WGS1",256}, {"WPT1",1} } },
+        { "Iris Pro",                                        { {"WGS1",64}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // Intel accelerators
       kDeviceTypeAccelerator, "Intel", {
-        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
-        { "GRID K520",                                       { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "GeForce GTX 680",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "GRID K520",                                       { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 1070",                                { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 670",                                 { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 680",                                 { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 750",                                 { {"WGS1",128}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
   }
@@ -146,43 +159,47 @@ const Database::DatabaseEntry Database::XgemvDouble = {
   "Xgemv", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
-        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
-        { "Hawaii",                                          { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
-        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",2}, {"WGS3",64}, {"WPT3",2} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1} } },
+        { "Hawaii",                                          { {"WGS1",128}, {"WPT1",1} } },
+        { "Oland",                                           { {"WGS1",256}, {"WPT1",1} } },
+        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1} } },
+        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",256}, {"WPT1",1} } },
       }
     },
     { // Intel CPUs
       kDeviceTypeCPU, "Intel", {
-        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",2}, {"VW2",4}, {"WGS2",128}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",2} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",2}, {"VW2",1}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",2} } },
+        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",4} } },
       }
     },
     { // Intel accelerators
       kDeviceTypeAccelerator, "Intel", {
-        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
-        { "GRID K520",                                       { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "GeForce GTX 480",                                 { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "GeForce GTX 680",                                 { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",2}, {"WGS3",128}, {"WPT3",2} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
-        { "GeForce GTX 980",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
-        { "GeForce GTX TITAN X",                             { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
-        { "Tesla K20m",                                      { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Tesla K40m",                                      { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "GRID K520",                                       { {"WGS1",128}, {"WPT1",1} } },
+        { "GeForce GTX 1070",                                { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 480",                                 { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 670",                                 { {"WGS1",128}, {"WPT1",1} } },
+        { "GeForce GTX 680",                                 { {"WGS1",128}, {"WPT1",1} } },
+        { "GeForce GTX 750",                                 { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 980",                                 { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX TITAN X",                             { {"WGS1",64}, {"WPT1",1} } },
+        { "Tesla K20m",                                      { {"WGS1",256}, {"WPT1",1} } },
+        { "Tesla K40m",                                      { {"WGS1",256}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",128}, {"WPT1",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "default",                                         { {"WGS1",128}, {"WPT1",1} } },
       }
     },
   }
@@ -194,36 +211,38 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
   "Xgemv", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
-        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",256}, {"WPT2",1}, {"VW3",1}, {"WGS3",128}, {"WPT3",1} } },
-        { "Hawaii",                                          { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WPT1",1} } },
+        { "Hawaii",                                          { {"WGS1",64}, {"WPT1",1} } },
+        { "Oland",                                           { {"WGS1",256}, {"WPT1",1} } },
+        { "Pitcairn",                                        { {"WGS1",256}, {"WPT1",1} } },
+        { "Tahiti",                                          { {"WGS1",256}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // Intel CPUs
       kDeviceTypeCPU, "Intel", {
-        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
-        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4}, {"VW2",4}, {"WGS2",64}, {"WPT2",4}, {"VW3",2}, {"WGS3",256}, {"WPT3",2} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",2}, {"WGS2",64}, {"WPT2",4}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"WGS1",64}, {"WPT1",4} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",4} } },
       }
     },
     { // Intel accelerators
       kDeviceTypeAccelerator, "Intel", {
-        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"WGS1",64}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
-        { "GRID K520",                                       { {"WGS1",128}, {"WPT1",1}, {"VW2",1}, {"WGS2",128}, {"WPT2",1}, {"VW3",1}, {"WGS3",256}, {"WPT3",1} } },
-        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "GRID K520",                                       { {"WGS1",128}, {"WPT1",1} } },
+        { "GeForce GTX 480",                                 { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 670",                                 { {"WGS1",128}, {"WPT1",1} } },
+        { "default",                                         { {"WGS1",128}, {"WPT1",1} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",64}, {"WPT1",1}, {"VW2",1}, {"WGS2",64}, {"WPT2",1}, {"VW3",1}, {"WGS3",64}, {"WPT3",1} } },
+        { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
     },
   }
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
new file mode 100644
index 00000000..52af628c
--- /dev/null
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -0,0 +1,250 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemv_Fast' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastHalf = {
+  "XgemvFast", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW2",1}, {"WGS2",16}, {"WPT2",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastSingle = {
+  "XgemvFast", Precision::kSingle, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "Hawaii",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Oland",                                           { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Pitcairn",                                        { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Tahiti",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // Intel CPUs
+      kDeviceTypeCPU, "Intel", {
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
+        { "default",                                         { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "Iris",                                            { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
+        { "Iris Pro",                                        { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
+        { "default",                                         { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
+      }
+    },
+    { // Intel accelerators
+      kDeviceTypeAccelerator, "Intel", {
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
+        { "GeForce GTX 1070",                                { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 480",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "GeForce GTX 670",                                 { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
+        { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "GeForce GTX 750",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Tesla K20m",                                      { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "Tesla K40m",                                      { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
+  "XgemvFast", Precision::kComplexSingle, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
+        { "Hawaii",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Oland",                                           { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Pitcairn",                                        { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Tahiti",                                          { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // Intel CPUs
+      kDeviceTypeCPU, "Intel", {
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
+        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 530",                        { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Iris",                                            { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Iris Pro",                                        { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // Intel accelerators
+      kDeviceTypeAccelerator, "Intel", {
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 1070",                                { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "GeForce GTX 480",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastDouble = {
+  "XgemvFast", Precision::kDouble, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "Hawaii",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Oland",                                           { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Pitcairn",                                        { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Tahiti",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // Intel CPUs
+      kDeviceTypeCPU, "Intel", {
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",4} } },
+      }
+    },
+    { // Intel accelerators
+      kDeviceTypeAccelerator, "Intel", {
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 1070",                                { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 480",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "GeForce GTX 750",                                 { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "Tesla K20m",                                      { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "Tesla K40m",                                      { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
+  "XgemvFast", Precision::kComplexDouble, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "Hawaii",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Oland",                                           { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "Pitcairn",                                        { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "Tahiti",                                          { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // Intel CPUs
+      kDeviceTypeCPU, "Intel", {
+        { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
+        { "Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz",         { {"VW2",4}, {"WGS2",64}, {"WPT2",4} } },
+        { "default",                                         { {"VW2",2}, {"WGS2",64}, {"WPT2",4} } },
+      }
+    },
+    { // Intel accelerators
+      kDeviceTypeAccelerator, "Intel", {
+        { "Intel(R) Many Integrated Core Acceleration Card", { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GRID K520",                                       { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
+        { "GeForce GTX 480",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
new file mode 100644
index 00000000..328094e1
--- /dev/null
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -0,0 +1,154 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemv_Fast_Rot' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastRotHalf = { // params: VW3, WGS3, WPT3
+  "XgemvFastRot", Precision::kHalf, { // half precision: no device-specific groups are listed
+    { // Default (kDeviceTypeAll, vendor "default") - the only group in this entry
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastRotSingle = { // params: VW3, WGS3, WPT3
+  "XgemvFastRot", Precision::kSingle, { // AMD GPU, Intel CPU, Intel GPU, NVIDIA GPU, default
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
+      }
+    },
+    { // Intel CPUs
+      kDeviceTypeCPU, "Intel", {
+        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
+        { "Iris Pro",                                        { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+      }
+    },
+    { // Default (kDeviceTypeAll, vendor "default")
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = { // params: VW3, WGS3, WPT3
+  "XgemvFastRot", Precision::kComplexSingle, { // AMD GPU, Intel CPU, Intel GPU, default
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
+      }
+    },
+    { // Intel CPUs
+      kDeviceTypeCPU, "Intel", {
+        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
+        { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
+        { "Iris Pro",                                        { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
+      }
+    },
+    { // Default (kDeviceTypeAll, vendor "default")
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastRotDouble = { // params: VW3, WGS3, WPT3
+  "XgemvFastRot", Precision::kDouble, { // AMD GPU, Intel CPU, NVIDIA GPU, default
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+      }
+    },
+    { // Intel CPUs
+      kDeviceTypeCPU, "Intel", {
+        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",16}, {"WPT3",8} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+      }
+    },
+    { // Default (kDeviceTypeAll, vendor "default")
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = { // params: VW3, WGS3, WPT3
+  "XgemvFastRot", Precision::kComplexDouble, { // AMD GPU, Intel CPU, default
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
+      }
+    },
+    { // Intel CPUs
+      kDeviceTypeCPU, "Intel", {
+        { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",16}, {"WPT3",16} } },
+      }
+    },
+    { // Default (kDeviceTypeAll, vendor "default")
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index 3727cc57..3e9c25c1 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -18,6 +18,7 @@ const Database::DatabaseEntry Database::XgerHalf = {
   "Xger", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
       }
@@ -38,9 +39,10 @@ const Database::DatabaseEntry Database::XgerSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
         { "Hawaii",                                          { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
+        { "Oland",                                           { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "Pitcairn",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
         { "Tahiti",                                          { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -53,29 +55,34 @@ const Database::DatabaseEntry Database::XgerSingle = {
       kDeviceTypeCPU, "Intel", {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",128}, {"WGS2",2}, {"WPT",4} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
-        { "default",                                         { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",8}, {"WPT",4} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "Intel(R) HD Graphics 530",                        { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
-        { "default",                                         { {"WGS1",8}, {"WGS2",1}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
+        { "GeForce GTX 1070",                                { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
         { "GeForce GTX 480",                                 { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+        { "GeForce GTX 670",                                 { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
+        { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
       }
     },
   }
@@ -89,9 +96,10 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
         { "Hawaii",                                          { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
+        { "Oland",                                           { {"WGS1",4}, {"WGS2",8}, {"WPT",1} } },
         { "Pitcairn",                                        { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
         { "Tahiti",                                          { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
-        { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -104,29 +112,34 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
       kDeviceTypeCPU, "Intel", {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",512}, {"WGS2",4}, {"WPT",2} } },
       }
     },
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
-        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",4}, {"WPT",1} } },
+        { "Intel(R) HD Graphics 530",                        { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
+        { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
+        { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "Iris Pro",                                        { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
-        { "default",                                         { {"WGS1",16}, {"WGS2",2}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
+        { "GeForce GTX 1070",                                { {"WGS1",16}, {"WGS2",64}, {"WPT",2} } },
         { "GeForce GTX 480",                                 { {"WGS1",128}, {"WGS2",2}, {"WPT",2} } },
+        { "GeForce GTX 670",                                 { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+        { "GeForce GTX 750",                                 { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",16}, {"WGS2",2}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",4}, {"WPT",2} } },
       }
     },
   }
@@ -140,9 +153,10 @@ const Database::DatabaseEntry Database::XgerDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
         { "Hawaii",                                          { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+        { "Oland",                                           { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
         { "Pitcairn",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
         { "Tahiti",                                          { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
-        { "default",                                         { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -155,21 +169,24 @@ const Database::DatabaseEntry Database::XgerDouble = {
       kDeviceTypeCPU, "Intel", {
         { "Intel(R) Core(TM) i5-6200U CPU @ 2.30GHz",        { {"WGS1",512}, {"WGS2",16}, {"WPT",1} } },
         { "Intel(R) Core(TM) i7-5930K CPU @ 3.50GHz",        { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",512}, {"WGS2",8}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",512}, {"WGS2",8}, {"WPT",2} } },
       }
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"WGS1",128}, {"WGS2",8}, {"WPT",2} } },
+        { "GeForce GTX 1070",                                { {"WGS1",32}, {"WGS2",8}, {"WPT",1} } },
         { "GeForce GTX 480",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
+        { "GeForce GTX 670",                                 { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
+        { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",16}, {"WGS2",4}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",16}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
       }
     },
   }
@@ -183,9 +200,10 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
       kDeviceTypeGPU, "AMD", {
         { "AMD Radeon R9 M370X Compute Engine",              { {"WGS1",64}, {"WGS2",1}, {"WPT",1} } },
         { "Hawaii",                                          { {"WGS1",128}, {"WGS2",1}, {"WPT",1} } },
+        { "Oland",                                           { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
         { "Pitcairn",                                        { {"WGS1",64}, {"WGS2",4}, {"WPT",1} } },
         { "Tahiti",                                          { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
-        { "default",                                         { {"WGS1",32}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",1} } },
       }
     },
     { // ARM GPUs
@@ -204,15 +222,18 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
         { "GRID K520",                                       { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
+        { "GeForce GTX 1070",                                { {"WGS1",8}, {"WGS2",128}, {"WPT",1} } },
         { "GeForce GTX 480",                                 { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
+        { "GeForce GTX 670",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
+        { "GeForce GTX 750",                                 { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",8}, {"WGS2",2}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"WGS1",8}, {"WGS2",1}, {"WPT",1} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
       }
     },
   }
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 08c47d87..223501fd 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -109,6 +109,16 @@ R"(
   typedef real singlereal;
 #endif
 
+// Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no
+// conversion, but half precision is not supported as a kernel argument so it arrives as a float.
+#if PRECISION == 16
+  typedef float real_arg;
+  #define GetRealArg(x) ((half)(x))  // fully parenthesised: the cast covers the whole argument
+#else
+  typedef real real_arg;
+  #define GetRealArg(x) (x)  // no conversion needed: 'real_arg' is the same type as 'real'
+#endif
+
 // =================================================================================================
 
 // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific
@@ -138,6 +148,13 @@ R"(
   #define SetToOne(a) a = ONE
 #endif
 
+// Determines whether a variable is zero (complex precisions: both the .x and .y components)
+#if PRECISION == 3232 || PRECISION == 6464
+  #define IsZero(a) (((a).x == ZERO) && ((a).y == ZERO))  // note: 'a' is evaluated twice
+#else
+  #define IsZero(a) ((a) == ZERO)
+#endif
+
 // The absolute value (component-wise)
 #if PRECISION == 3232 || PRECISION == 6464
   #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index 48d0eb5c..48ad2e75 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -30,10 +30,10 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xamax(const int n,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global singlereal* maxgm, __global unsigned int* imaxgm) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xamax(const int n,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global singlereal* maxgm, __global unsigned int* imaxgm) {
   __local singlereal maxlm[WGS1];
   __local unsigned int imaxlm[WGS1];
   const int lid = get_local_id(0);
@@ -95,10 +95,10 @@ __kernel void Xamax(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XamaxEpilogue(const __global singlereal* restrict maxgm,
-                            const __global unsigned int* restrict imaxgm,
-                            __global unsigned int* imax, const int imax_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XamaxEpilogue(const __global singlereal* restrict maxgm,
+                   const __global unsigned int* restrict imaxgm,
+                   __global unsigned int* imax, const int imax_offset) {
   __local singlereal maxlm[WGS2];
   __local unsigned int imaxlm[WGS2];
   const int lid = get_local_id(0);
diff --git a/src/kernels/level1/xasum.opencl b/src/kernels/level1/xasum.opencl
index 58d0f11b..1fc91be8 100644
--- a/src/kernels/level1/xasum.opencl
+++ b/src/kernels/level1/xasum.opencl
@@ -30,10 +30,10 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the loading and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xasum(const int n,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* output) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xasum(const int n,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global real* output) {
   __local real lm[WGS1];
   const int lid = get_local_id(0);
   const int wgid = get_group_id(0);
@@ -74,9 +74,9 @@ __kernel void Xasum(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XasumEpilogue(const __global real* restrict input,
-                            __global real* asum, const int asum_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XasumEpilogue(const __global real* restrict input,
+                   __global real* asum, const int asum_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
 
diff --git a/src/kernels/level1/xaxpy.opencl b/src/kernels/level1/xaxpy.opencl
index e0efadc1..ece8476e 100644
--- a/src/kernels/level1/xaxpy.opencl
+++ b/src/kernels/level1/xaxpy.opencl
@@ -22,11 +22,11 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* ygm, const int y_offset, const int y_inc) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xaxpy(const int n, const real_arg arg_alpha,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global real* ygm, const int y_offset, const int y_inc) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -40,11 +40,11 @@ __kernel void Xaxpy(const int n, const __constant real* restrict arg_alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XaxpyFast(const int n, const __constant real* restrict arg_alpha,
-                        const __global realV* restrict xgm,
-                        __global realV* ygm) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XaxpyFast(const int n, const real_arg arg_alpha,
+               const __global realV* restrict xgm,
+               __global realV* ygm) {
+  const real alpha = GetRealArg(arg_alpha);
 
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
diff --git a/src/kernels/level1/xcopy.opencl b/src/kernels/level1/xcopy.opencl
index 97c27ccf..228e0735 100644
--- a/src/kernels/level1/xcopy.opencl
+++ b/src/kernels/level1/xcopy.opencl
@@ -22,10 +22,10 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xcopy(const int n,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xcopy(const int n,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global real* ygm, const int y_offset, const int y_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -38,10 +38,10 @@ __kernel void Xcopy(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XcopyFast(const int n,
-                        const __global realV* restrict xgm,
-                        __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XcopyFast(const int n,
+               const __global realV* restrict xgm,
+               __global realV* ygm) {
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xdot.opencl b/src/kernels/level1/xdot.opencl
index e13eb3c1..02f04ea7 100644
--- a/src/kernels/level1/xdot.opencl
+++ b/src/kernels/level1/xdot.opencl
@@ -30,11 +30,11 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the sum operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xdot(const int n,
-                   const __global real* restrict xgm, const int x_offset, const int x_inc,
-                   const __global real* restrict ygm, const int y_offset, const int y_inc,
-                   __global real* output, const int do_conjugate) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xdot(const int n,
+          const __global real* restrict xgm, const int x_offset, const int x_inc,
+          const __global real* restrict ygm, const int y_offset, const int y_inc,
+          __global real* output, const int do_conjugate) {
   __local real lm[WGS1];
   const int lid = get_local_id(0);
   const int wgid = get_group_id(0);
@@ -73,9 +73,9 @@ __kernel void Xdot(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XdotEpilogue(const __global real* restrict input,
-                           __global real* dot, const int dot_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XdotEpilogue(const __global real* restrict input,
+                  __global real* dot, const int dot_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
 
diff --git a/src/kernels/level1/xnrm2.opencl b/src/kernels/level1/xnrm2.opencl
index 9803687a..f6d869cb 100644
--- a/src/kernels/level1/xnrm2.opencl
+++ b/src/kernels/level1/xnrm2.opencl
@@ -30,10 +30,10 @@ R"(
 // =================================================================================================
 
 // The main reduction kernel, performing the multiplication and the majority of the operation
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xnrm2(const int n,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    __global real* output) {
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xnrm2(const int n,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           __global real* output) {
   __local real lm[WGS1];
   const int lid = get_local_id(0);
   const int wgid = get_group_id(0);
@@ -72,9 +72,9 @@ __kernel void Xnrm2(const int n,
 
 // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 // be launched with a single workgroup only.
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void Xnrm2Epilogue(const __global real* restrict input,
-                            __global real* nrm2, const int nrm2_offset) {
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void Xnrm2Epilogue(const __global real* restrict input,
+                   __global real* nrm2, const int nrm2_offset) {
   __local real lm[WGS2];
   const int lid = get_local_id(0);
 
diff --git a/src/kernels/level1/xscal.opencl b/src/kernels/level1/xscal.opencl
index 59936776..3da9c2fd 100644
--- a/src/kernels/level1/xscal.opencl
+++ b/src/kernels/level1/xscal.opencl
@@ -22,9 +22,10 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xscal(const int n, const real alpha,
-                    __global real* xgm, const int x_offset, const int x_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xscal(const int n, const real_arg arg_alpha,
+           __global real* xgm, const int x_offset, const int x_inc) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -40,9 +41,11 @@ __kernel void Xscal(const int n, const real alpha,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XscalFast(const int n, const real alpha,
-                        __global realV* xgm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XscalFast(const int n, const real_arg arg_alpha,
+               __global realV* xgm) {
+  const real alpha = GetRealArg(arg_alpha);
+
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level1/xswap.opencl b/src/kernels/level1/xswap.opencl
index f6487b58..267271c0 100644
--- a/src/kernels/level1/xswap.opencl
+++ b/src/kernels/level1/xswap.opencl
@@ -22,10 +22,10 @@ R"(
 // =================================================================================================
 
 // Full version of the kernel with offsets and strided accesses
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void Xswap(const int n,
-                    __global real* xgm, const int x_offset, const int x_inc,
-                    __global real* ygm, const int y_offset, const int y_inc) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void Xswap(const int n,
+           __global real* xgm, const int x_offset, const int x_inc,
+           __global real* ygm, const int y_offset, const int y_inc) {
 
   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
   #pragma unroll
@@ -40,10 +40,10 @@ __kernel void Xswap(const int n,
 
 // Faster version of the kernel without offsets and strided accesses. Also assumes that 'n' is
 // dividable by 'VW', 'WGS' and 'WPT'.
-__attribute__((reqd_work_group_size(WGS, 1, 1)))
-__kernel void XswapFast(const int n,
-                        __global realV* xgm,
-                        __global realV* ygm) {
+__kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
+void XswapFast(const int n,
+               __global realV* xgm,
+               __global realV* ygm) {
   #pragma unroll
   for (int w=0; w<WPT; ++w) {
     const int id = w*get_global_size(0) + get_global_id(0);
diff --git a/src/kernels/level2/xgemv.opencl b/src/kernels/level2/xgemv.opencl
index 65b4291f..ff011acd 100644
--- a/src/kernels/level2/xgemv.opencl
+++ b/src/kernels/level2/xgemv.opencl
@@ -210,18 +210,18 @@ inline real LoadMatrixA(const __global real* restrict agm, const int x, const in
 // =================================================================================================
 
 // Full version of the kernel
-__attribute__((reqd_work_group_size(WGS1, 1, 1)))
-__kernel void Xgemv(const int m, const int n,
-                    const __constant real* restrict arg_alpha,
-                    const __constant real* restrict arg_beta,
+__kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
+void Xgemv(const int m, const int n,
+                    const real_arg arg_alpha,
+                    const real_arg arg_beta,
                     const int a_rotated,
                     const __global real* restrict agm, const int a_offset, const int a_ld,
                     const __global real* restrict xgm, const int x_offset, const int x_inc,
                     __global real* ygm, const int y_offset, const int y_inc,
                     const int do_conjugate, const int parameter,
                     const int kl, const int ku) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
 
   // Local memory for the vector X
   __local real xlm[WGS1];
diff --git a/src/kernels/level2/xgemv_fast.opencl b/src/kernels/level2/xgemv_fast.opencl
index 6a494e84..02a1f956 100644
--- a/src/kernels/level2/xgemv_fast.opencl
+++ b/src/kernels/level2/xgemv_fast.opencl
@@ -38,7 +38,7 @@ R"(
   #define WGS3 64     // The local work-group size
 #endif
 #ifndef WPT3
-  #define WPT3 1      // The amount of work-per-thread
+  #define WPT3 1      // The tile-size
 #endif
 #ifndef VW3
   #define VW3 1       // Vector width of matrix A loads
@@ -74,18 +74,12 @@ R"(
 
 // =================================================================================================
 
-// Loads a vector input value (1/2)
+// Loads a vector input value
 inline realVF LoadMatrixAVF(const __global realVF* restrict agm, const int x, const int y,
                             const int a_ld) {
   return agm[a_ld*y + x];
 }
 
-// Loads a vector input value (2/2): as before, but different data-type
-inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x, const int y,
-                              const int a_ld) {
-  return agm[a_ld*y + x];
-}
-
 // =================================================================================================
 
 // Faster version of the kernel, assuming that:
@@ -94,23 +88,23 @@ inline realVFR LoadMatrixAVFR(const __global realVFR* restrict agm, const int x,
 // --> 'a_ld' is a multiple of VW2
 // --> 'a_rotated' is 0
 // --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS2, 1, 1)))
-__kernel void XgemvFast(const int m, const int n,
-                        const __constant real* restrict arg_alpha,
-                        const __constant real* restrict arg_beta,
-                        const int a_rotated,
-                        const __global realVF* restrict agm, const int a_offset, const int a_ld,
-                        const __global real* restrict xgm, const int x_offset, const int x_inc,
-                        __global real* ygm, const int y_offset, const int y_inc,
-                        const int do_conjugate, const int parameter,
-                        const int kl, const int ku) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+__kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
+void XgemvFast(const int m, const int n,
+               const real_arg arg_alpha,
+               const real_arg arg_beta,
+               const int a_rotated,
+               const __global realVF* restrict agm, const int a_offset, const int a_ld,
+               const __global real* restrict xgm, const int x_offset, const int x_inc,
+               __global real* ygm, const int y_offset, const int y_inc,
+               const int do_conjugate, const int parameter,
+               const int kl_unused, const int ku_unused) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
 
   // Local memory for the vector X
   __local real xlm[WGS2];
 
-  // Initializes the accumulation register
+  // Initializes the accumulation registers
   real acc[WPT2];
   #pragma unroll
   for (int w=0; w<WPT2; ++w) {
@@ -134,7 +128,7 @@ __kernel void XgemvFast(const int m, const int n,
       #pragma unroll
       for (int w=0; w<WPT2/VW2; ++w) {
         const int gid = (WPT2/VW2)*get_global_id(0) + w;
-        realVF avec = LoadMatrixAVF(agm, gid, k, a_ld/VW2);
+        realVF avec = agm[(a_ld/VW2)*k + gid];
         #if VW2 == 1
           MultiplyAdd(acc[VW2*w+0], xlm[kl], avec);
         #elif VW2 == 2
@@ -196,84 +190,96 @@ __kernel void XgemvFast(const int m, const int n,
 // --> 'a_ld' is a multiple of VW3
 // --> 'a_rotated' is 1
 // --> 'do_conjugate' is 0
-__attribute__((reqd_work_group_size(WGS3, 1, 1)))
-__kernel void XgemvFastRot(const int m, const int n,
-                           const __constant real* restrict arg_alpha,
-                           const __constant real* restrict arg_beta,
-                           const int a_rotated,
-                           const __global realVFR* restrict agm, const int a_offset, const int a_ld,
-                           const __global real* restrict xgm, const int x_offset, const int x_inc,
-                           __global real* ygm, const int y_offset, const int y_inc,
-                           const int do_conjugate, const int parameter,
-                           const int kl, const int ku) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
+__kernel __attribute__((reqd_work_group_size(WGS3, 1, 1)))
+void XgemvFastRot(const int m, const int n,
+                  const real_arg arg_alpha,
+                  const real_arg arg_beta,
+                  const int a_rotated,
+                  const __global realVFR* restrict agm, const int a_offset, const int a_ld,
+                  const __global real* restrict xgm, const int x_offset, const int x_inc,
+                  __global real* ygm, const int y_offset, const int y_inc,
+                  const int do_conjugate, const int parameter,
+                  const int kl_unused, const int ku_unused) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
+
+  // Local memory to store a tile of the matrix (for coalescing)
+  __local real tile[WPT3][WGS3];
+  const int lid = get_local_id(0);
+  const int lid_mod = lid % (WPT3/VW3);
+  const int lid_div = lid / (WPT3/VW3);
 
   // Local memory for the vector X
-  __local real xlm[WGS3];
+  __local real xlm[WPT3];
 
   // Initializes the accumulation register
-  real acc[WPT3];
-  #pragma unroll
-  for (int w=0; w<WPT3; ++w) {
-    SetToZero(acc[w]);
-  }
+  real acc;
+  SetToZero(acc);
 
-  // Loops over work-group sized portions of the work
-  for (int kwg=0; kwg<n; kwg+=WGS3) {
+  // Loops over tile-sized portions of the work
+  for (int kwg=0; kwg<n; kwg+=WPT3) {
 
     // Loads the vector X into local memory
-    const int lid = get_local_id(0);
-    xlm[lid] = xgm[(kwg + lid)*x_inc + x_offset];
+    if (lid < WPT3) {
+      xlm[lid] = xgm[(kwg + lid) * x_inc + x_offset];
+    }
+
+    // Loads the matrix A into local memory
+    #pragma unroll
+    for (int kl=0; kl<WPT3/VW3; ++kl) {
+      const int x = (kwg/VW3) + lid_mod;
+      const int y = get_group_id(0) * WGS3 + lid_div * (WPT3/VW3) + kl;
+      realVFR avec = agm[(a_ld/VW3) * y + x];
+      #if VW3 == 1
+        tile[kl*VW3 + 0][lid] = avec;
+      #elif VW3 == 2
+        tile[kl*VW3 + 0][lid] = avec.x;
+        tile[kl*VW3 + 1][lid] = avec.y;
+      #elif VW3 == 4
+        tile[kl*VW3 + 0][lid] = avec.x;
+        tile[kl*VW3 + 1][lid] = avec.y;
+        tile[kl*VW3 + 2][lid] = avec.z;
+        tile[kl*VW3 + 3][lid] = avec.w;
+      #elif VW3 == 8
+        tile[kl*VW3 + 0][lid] = avec.s0;
+        tile[kl*VW3 + 1][lid] = avec.s1;
+        tile[kl*VW3 + 2][lid] = avec.s2;
+        tile[kl*VW3 + 3][lid] = avec.s3;
+        tile[kl*VW3 + 4][lid] = avec.s4;
+        tile[kl*VW3 + 5][lid] = avec.s5;
+        tile[kl*VW3 + 6][lid] = avec.s6;
+        tile[kl*VW3 + 7][lid] = avec.s7;
+      #elif VW3 == 16
+        tile[kl*VW3 + 0][lid] = avec.s0;
+        tile[kl*VW3 + 1][lid] = avec.s1;
+        tile[kl*VW3 + 2][lid] = avec.s2;
+        tile[kl*VW3 + 3][lid] = avec.s3;
+        tile[kl*VW3 + 4][lid] = avec.s4;
+        tile[kl*VW3 + 5][lid] = avec.s5;
+        tile[kl*VW3 + 6][lid] = avec.s6;
+        tile[kl*VW3 + 7][lid] = avec.s7;
+        tile[kl*VW3 + 8][lid] = avec.s8;
+        tile[kl*VW3 + 9][lid] = avec.s9;
+        tile[kl*VW3 + 10][lid] = avec.sA;
+        tile[kl*VW3 + 11][lid] = avec.sB;
+        tile[kl*VW3 + 12][lid] = avec.sC;
+        tile[kl*VW3 + 13][lid] = avec.sD;
+        tile[kl*VW3 + 14][lid] = avec.sE;
+        tile[kl*VW3 + 15][lid] = avec.sF;
+      #endif
+    }
 
     // Synchronizes all threads in a workgroup
     barrier(CLK_LOCAL_MEM_FENCE);
 
     // The multiply-add function (rotated)
     #pragma unroll
-    for (int kl=0; kl<WGS3/VW3; ++kl) {
-      const int k = (kwg/VW3) + kl;
+    for (int kl=0; kl<WPT3/VW3; ++kl) {
       #pragma unroll
-      for (int w=0; w<WPT3; ++w) {
-        const int gid = WPT3*get_global_id(0) + w;
-        realVFR avec = LoadMatrixAVFR(agm, k, gid, a_ld/VW3);
-        #if VW3 == 1
-          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec);
-        #elif VW3 == 2
-          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
-          MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
-        #elif VW3 == 4
-          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.x);
-          MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.y);
-          MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.z);
-          MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.w);
-        #elif VW3 == 8
-          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
-          MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
-          MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
-          MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
-          MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
-          MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
-          MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
-          MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
-        #elif VW3 == 16
-          MultiplyAdd(acc[w], xlm[VW3*kl+0], avec.s0);
-          MultiplyAdd(acc[w], xlm[VW3*kl+1], avec.s1);
-          MultiplyAdd(acc[w], xlm[VW3*kl+2], avec.s2);
-          MultiplyAdd(acc[w], xlm[VW3*kl+3], avec.s3);
-          MultiplyAdd(acc[w], xlm[VW3*kl+4], avec.s4);
-          MultiplyAdd(acc[w], xlm[VW3*kl+5], avec.s5);
-          MultiplyAdd(acc[w], xlm[VW3*kl+6], avec.s6);
-          MultiplyAdd(acc[w], xlm[VW3*kl+7], avec.s7);
-          MultiplyAdd(acc[w], xlm[VW3*kl+8], avec.s8);
-          MultiplyAdd(acc[w], xlm[VW3*kl+9], avec.s9);
-          MultiplyAdd(acc[w], xlm[VW3*kl+10], avec.sA);
-          MultiplyAdd(acc[w], xlm[VW3*kl+11], avec.sB);
-          MultiplyAdd(acc[w], xlm[VW3*kl+12], avec.sC);
-          MultiplyAdd(acc[w], xlm[VW3*kl+13], avec.sD);
-          MultiplyAdd(acc[w], xlm[VW3*kl+14], avec.sE);
-          MultiplyAdd(acc[w], xlm[VW3*kl+15], avec.sF);
-        #endif
+      for (int v=0; v<VW3; ++v) {
+        real aval = tile[lid_mod*VW3 + v][lid_div * (WPT3/VW3) + kl];
+        real xval = xlm[kl*VW3 + v];
+        MultiplyAdd(acc, xval, aval);
       }
     }
 
@@ -282,12 +288,9 @@ __kernel void XgemvFastRot(const int m, const int n,
   }
 
   // Stores the final result
-  #pragma unroll
-  for (int w=0; w<WPT3; ++w) {
-    const int gid = WPT3*get_global_id(0) + w;
-    real yval = ygm[gid*y_inc + y_offset];
-    AXPBY(ygm[gid*y_inc + y_offset], alpha, acc[w], beta, yval);
-  }
+  const int gid = get_global_id(0);
+  real yval = ygm[gid * y_inc + y_offset];
+  AXPBY(ygm[gid * y_inc + y_offset], alpha, acc, beta, yval);
 }
 
 // =================================================================================================
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index 63817afb..1b9ded12 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -18,14 +18,14 @@ R"(
 // =================================================================================================
 
 // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xger(const int max1, const int max2,
-                   const __constant real* restrict arg_alpha,
-                   const __global real* restrict xgm, const int x_offset, const int x_inc,
-                   const __global real* ygm, const int y_offset, const int y_inc,
-                   __global real* restrict agm, const int a_offset, const int a_ld,
-                   const int is_rowmajor) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xger(const int max1, const int max2,
+          const real_arg arg_alpha,
+          const __global real* restrict xgm, const int x_offset, const int x_inc,
+          const __global real* ygm, const int y_offset, const int y_inc,
+          __global real* restrict agm, const int a_offset, const int a_ld,
+          const int is_rowmajor) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and Y
   real xvalues[WPT];
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index fc635f2e..b0772218 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -18,13 +18,13 @@ R"(
 // =================================================================================================
 
 // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher(const int n,
-                   const __constant real* restrict arg_alpha,
-                   const __global real* restrict xgm, const int x_offset, const int x_inc,
-                   __global real* restrict agm, const int a_offset, const int a_ld,
-                   const int is_upper, const int is_rowmajor) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher(const int n,
+          const real_arg arg_alpha,
+          const __global real* restrict xgm, const int x_offset, const int x_inc,
+          __global real* restrict agm, const int a_offset, const int a_ld,
+          const int is_upper, const int is_rowmajor) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and XT
   real xvalues[WPT];
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index a66f255f..00a756c9 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -18,14 +18,14 @@ R"(
 // =================================================================================================
 
 // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
-__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher2(const int n,
-                    const __constant real* restrict arg_alpha,
-                    const __global real* restrict xgm, const int x_offset, const int x_inc,
-                    const __global real* restrict ygm, const int y_offset, const int y_inc,
-                    __global real* restrict agm, const int a_offset, const int a_ld,
-                    const int is_upper, const int is_rowmajor) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
+void Xher2(const int n,
+           const real_arg arg_alpha,
+           const __global real* restrict xgm, const int x_offset, const int x_inc,
+           const __global real* restrict ygm, const int y_offset, const int y_inc,
+           __global real* restrict agm, const int a_offset, const int a_ld,
+           const int is_upper, const int is_rowmajor) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Register storage for X and Y
   real xvalues[WPT];
diff --git a/src/kernels/level3/convert_hermitian.opencl b/src/kernels/level3/convert_hermitian.opencl
index 53cc161a..ed2ded98 100644
--- a/src/kernels/level3/convert_hermitian.opencl
+++ b/src/kernels/level3/convert_hermitian.opencl
@@ -20,13 +20,13 @@ R"(
 
 // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermLowerToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -59,13 +59,13 @@ __kernel void HermLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void HermUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void HermUpperToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
diff --git a/src/kernels/level3/convert_symmetric.opencl b/src/kernels/level3/convert_symmetric.opencl
index c6ce93ca..8ae53b37 100644
--- a/src/kernels/level3/convert_symmetric.opencl
+++ b/src/kernels/level3/convert_symmetric.opencl
@@ -20,13 +20,13 @@ R"(
 
 // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmLowerToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -53,13 +53,13 @@ __kernel void SymmLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void SymmUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void SymmUpperToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
diff --git a/src/kernels/level3/convert_triangular.opencl b/src/kernels/level3/convert_triangular.opencl
index fdd2461a..f848dcc1 100644
--- a/src/kernels/level3/convert_triangular.opencl
+++ b/src/kernels/level3/convert_triangular.opencl
@@ -20,14 +20,14 @@ R"(
 
 // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is
 // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaLowerToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaLowerToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest,
+                        const int unit_diagonal) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -55,14 +55,14 @@ __kernel void TriaLowerToSquared(const int src_dim,
 }
 
 // Same as above, but now the matrix' data is stored in the upper-triangle
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void TriaUpperToSquared(const int src_dim,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_dim,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const int unit_diagonal) {
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void TriaUpperToSquared(const int src_dim,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_dim,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest,
+                        const int unit_diagonal) {
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index 09e54e6d..695b9003 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -35,12 +35,12 @@ R"(
 
 // Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of
 // COPY_VW. Also requires both matrices to be of the same dimensions and without offset.
-__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
-__kernel void CopyMatrixFast(const int ld,
-                             __global const realC* restrict src,
-                             __global realC* dest,
-                             const __constant real* restrict arg_alpha) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+void CopyMatrixFast(const int ld,
+                    __global const realC* restrict src,
+                    __global realC* dest,
+                    const real_arg arg_alpha) {
+  const real alpha = GetRealArg(arg_alpha);
   #pragma unroll
   for (int w_one=0; w_one<COPY_WPT; ++w_one) {
     const int id_one = get_global_id(0);
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index d276cc60..29480b25 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -24,16 +24,16 @@ R"(
 // Copies a matrix from source to destination. The output is padded with zero values in case the
 // destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld
 // value and offset can be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyPadMatrix(const int src_one, const int src_two,
-                            const int src_ld, const int src_offset,
-                            __global const real* restrict src,
-                            const int dest_one, const int dest_two,
-                            const int dest_ld, const int dest_offset,
-                            __global real* dest,
-                            const __constant real* restrict arg_alpha,
-                            const int do_conjugate) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyPadMatrix(const int src_one, const int src_two,
+                   const int src_ld, const int src_offset,
+                   __global const real* restrict src,
+                   const int dest_one, const int dest_two,
+                   const int dest_ld, const int dest_offset,
+                   __global real* dest,
+                   const real_arg arg_alpha,
+                   const int do_conjugate) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -65,17 +65,17 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
 // Same as above, but now un-pads a matrix. This kernel reads data from a padded source matrix, but
 // writes only the actual data back to the destination matrix. Again, the ld value and offset can
 // be different.
-__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
-__kernel void CopyMatrix(const int src_one, const int src_two,
-                         const int src_ld, const int src_offset,
-                         __global const real* restrict src,
-                         const int dest_one, const int dest_two,
-                         const int dest_ld, const int dest_offset,
-                         __global real* dest,
-                         const __constant real* restrict arg_alpha,
-                         const int upper, const int lower,
-                         const int diagonal_imag_zero) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1)))
+void CopyMatrix(const int src_one, const int src_two,
+                const int src_ld, const int src_offset,
+                __global const real* restrict src,
+                const int dest_one, const int dest_two,
+                const int dest_ld, const int dest_offset,
+                __global real* dest,
+                const real_arg arg_alpha,
+                const int upper, const int lower,
+                const int diagonal_imag_zero) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index d5c46a30..70156d3a 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -36,12 +36,12 @@ R"(
 
 // Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without
 // offset. A more general version is available in 'padtranspose.opencl'.
-__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
-__kernel void TransposeMatrixFast(const int ld,
-                                  __global const realT* restrict src,
-                                  __global realT* dest,
-                                  const __constant real* restrict arg_alpha) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
+void TransposeMatrixFast(const int ld,
+                         __global const realT* restrict src,
+                         __global realT* dest,
+                         const real_arg arg_alpha) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
   // way over workgroups, breaking memory-bank dependencies.
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 2de0c7bd..ba0b7062 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -24,16 +24,16 @@ R"(
 
 // Transposes a matrix from source to destination. The output is padded with zero values in case the
 // destination matrix dimensions are larger than the transposed source matrix dimensions.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposePadMatrix(const int src_one, const int src_two,
-                                 const int src_ld, const int src_offset,
-                                 __global const real* restrict src,
-                                 const int dest_one, const int dest_two,
-                                 const int dest_ld, const int dest_offset,
-                                 __global real* dest,
-                                 const __constant real* restrict arg_alpha,
-                                 const int do_conjugate) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposePadMatrix(const int src_one, const int src_two,
+                        const int src_ld, const int src_offset,
+                        __global const real* restrict src,
+                        const int dest_one, const int dest_two,
+                        const int dest_ld, const int dest_offset,
+                        __global real* dest,
+                        const real_arg arg_alpha,
+                        const int do_conjugate) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -88,17 +88,17 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
 // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a
 // padded source matrix, but only the actual data is written back to the transposed destination
 // matrix. This kernel optionally checks for upper/lower triangular matrices.
-__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
-__kernel void TransposeMatrix(const int src_one, const int src_two,
-                              const int src_ld, const int src_offset,
-                              __global const real* restrict src,
-                              const int dest_one, const int dest_two,
-                              const int dest_ld, const int dest_offset,
-                              __global real* dest,
-                              const __constant real* restrict arg_alpha,
-                              const int upper, const int lower,
-                              const int diagonal_imag_zero) {
-  const real alpha = arg_alpha[0];
+__kernel __attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1)))
+void TransposeMatrix(const int src_one, const int src_two,
+                     const int src_ld, const int src_offset,
+                     __global const real* restrict src,
+                     const int dest_one, const int dest_two,
+                     const int dest_ld, const int dest_offset,
+                     __global real* dest,
+                     const real_arg arg_alpha,
+                     const int upper, const int lower,
+                     const int diagonal_imag_zero) {
+  const real alpha = GetRealArg(arg_alpha);
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
diff --git a/src/kernels/level3/xgemm_part1.opencl b/src/kernels/level3/xgemm_part1.opencl
index 1ad0a558..d0ce06ad 100644
--- a/src/kernels/level3/xgemm_part1.opencl
+++ b/src/kernels/level3/xgemm_part1.opencl
@@ -31,7 +31,7 @@
 //    o-------o        o-----o  
 //                              
 //
-// This kernel is seperated into two files. This is part 1 out of 2.
+// This kernel is separated into three files. This is part 1 out of 3.
 //
 // =================================================================================================
 
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index 42c1127c..e8234a29 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -7,7 +7,7 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This is part 2 of 2 of the GEMM kernel. See part 1 for more information.
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
 //
 // =================================================================================================
 
@@ -133,260 +133,98 @@ inline void StoreResults(__global realM* cgm, realM cpm[NWI][MWI/VWM], const int
       #endif
       int idm = mg + GetGroupID0() * (MWG/VWM);
       int idn = ng + GetGroupID1() * NWG;
-
-      // The final multiplication with alpha and the addition with beta*C
       int index = idn*(kSizeM/VWM) + idm;
+
       realM result;
       realM xval = cpm[ni][mi];
-      realM yval = cgm[index];
-      #if VWM == 1
-        AXPBY(result, alpha, xval, beta, yval);
-      #elif VWM == 2
-        AXPBY(result.x, alpha, xval.x, beta, yval.x);
-        AXPBY(result.y, alpha, xval.y, beta, yval.y);
-      #elif VWM == 4
-        AXPBY(result.x, alpha, xval.x, beta, yval.x);
-        AXPBY(result.y, alpha, xval.y, beta, yval.y);
-        AXPBY(result.z, alpha, xval.z, beta, yval.z);
-        AXPBY(result.w, alpha, xval.w, beta, yval.w);
-      #elif VWM == 8
-        AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
-        AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
-        AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
-        AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
-        AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
-        AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
-        AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
-        AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
-      #elif VWM == 16
-        AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
-        AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
-        AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
-        AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
-        AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
-        AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
-        AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
-        AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
-        AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
-        AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
-        AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
-        AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
-        AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
-        AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
-        AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
-        AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
-      #endif
-      cgm[index] = result;
-    }
-  }
-}
-
-// =================================================================================================
-
-// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
-inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
-                      const __global realM* restrict agm, const __global realN* restrict bgm,
-                      __global realM* cgm, realM cpm[NWI][MWI/VWM]
-                      #if SA == 1 && SB == 1
-                        , __local realM* alm, __local realN* blm
-                      #elif SA == 1
-                        , __local realM* alm
-                      #elif SB == 1
-                        , __local realN* blm
-                      #endif
-                      ) {
-
-  // Allocates workitem-private memory (registers)
-  realM apm[MWI/VWM];
-  realN bpm[NWI/VWN];
-
-  // Combined thread identifier (volatile to disable caching)
-  #if SA == 1 || SB == 1
-    volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
-  #endif
-
-  // Initializes the accumulation registers
-  InitAccRegisters(cpm);
-
-  // Loops over all workgroup tiles
-  for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
 
-    // Loads data: off-chip --> local (matrix A)
-    #if SA == 1
-      GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
-    #endif
-    // Loads data: off-chip --> local (matrix B)
-    #if SB == 1
-      GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
-    #endif
-    #if SA == 1 || SB == 1
-      barrier(CLK_LOCAL_MEM_FENCE);
-    #endif
-
-    // Loops over all workitem tiles, unrolled by a factor KWI
-    for (int pwi=0; pwi<KWG; pwi+=KWI) {
-      #pragma unroll
-      for (int pit=0; pit<KWI; ++pit) {
-        #if SA == 0 || SB == 0
-          int idk = kwg + pwi + pit;
-        #endif
-        #if SA == 1 || SB == 1
-          int kg = pwi+pit;
-        #endif
-
-        // Loads data: local --> private (matrix A)
-        #if SA == 1
-          LocalToPrivateA(alm, apm, kg);
-        // Loads data: off-chip --> private (matrix A)
-        #else
-          GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+      // The final multiplication with alpha (in case beta == 0)
+      if (IsZero(beta)) {
+        #if VWM == 1
+          Multiply(result, alpha, xval);
+        #elif VWM == 2
+          Multiply(result.x, alpha, xval.x);
+          Multiply(result.y, alpha, xval.y);
+        #elif VWM == 4
+          Multiply(result.x, alpha, xval.x);
+          Multiply(result.y, alpha, xval.y);
+          Multiply(result.z, alpha, xval.z);
+          Multiply(result.w, alpha, xval.w);
+        #elif VWM == 8
+          Multiply(result.s0, alpha, xval.s0);
+          Multiply(result.s1, alpha, xval.s1);
+          Multiply(result.s2, alpha, xval.s2);
+          Multiply(result.s3, alpha, xval.s3);
+          Multiply(result.s4, alpha, xval.s4);
+          Multiply(result.s5, alpha, xval.s5);
+          Multiply(result.s6, alpha, xval.s6);
+          Multiply(result.s7, alpha, xval.s7);
+        #elif VWM == 16
+          Multiply(result.s0, alpha, xval.s0);
+          Multiply(result.s1, alpha, xval.s1);
+          Multiply(result.s2, alpha, xval.s2);
+          Multiply(result.s3, alpha, xval.s3);
+          Multiply(result.s4, alpha, xval.s4);
+          Multiply(result.s5, alpha, xval.s5);
+          Multiply(result.s6, alpha, xval.s6);
+          Multiply(result.s7, alpha, xval.s7);
+          Multiply(result.s8, alpha, xval.s8);
+          Multiply(result.s9, alpha, xval.s9);
+          Multiply(result.sA, alpha, xval.sA);
+          Multiply(result.sB, alpha, xval.sB);
+          Multiply(result.sC, alpha, xval.sC);
+          Multiply(result.sD, alpha, xval.sD);
+          Multiply(result.sE, alpha, xval.sE);
+          Multiply(result.sF, alpha, xval.sF);
         #endif
+      }
 
-        // Loads data: local --> private (matrix B)
-        #if SB == 1
-          LocalToPrivateB(blm, bpm, kg);
-        // Loads data: off-chip --> private (matrix B)
-        #else
-          GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+      // The final multiplication with alpha and the addition with beta*C
+      else {
+        realM yval = cgm[index];
+        #if VWM == 1
+          AXPBY(result, alpha, xval, beta, yval);
+        #elif VWM == 2
+          AXPBY(result.x, alpha, xval.x, beta, yval.x);
+          AXPBY(result.y, alpha, xval.y, beta, yval.y);
+        #elif VWM == 4
+          AXPBY(result.x, alpha, xval.x, beta, yval.x);
+          AXPBY(result.y, alpha, xval.y, beta, yval.y);
+          AXPBY(result.z, alpha, xval.z, beta, yval.z);
+          AXPBY(result.w, alpha, xval.w, beta, yval.w);
+        #elif VWM == 8
+          AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
+          AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
+          AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
+          AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
+          AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
+          AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
+          AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
+          AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
+        #elif VWM == 16
+          AXPBY(result.s0, alpha, xval.s0, beta, yval.s0);
+          AXPBY(result.s1, alpha, xval.s1, beta, yval.s1);
+          AXPBY(result.s2, alpha, xval.s2, beta, yval.s2);
+          AXPBY(result.s3, alpha, xval.s3, beta, yval.s3);
+          AXPBY(result.s4, alpha, xval.s4, beta, yval.s4);
+          AXPBY(result.s5, alpha, xval.s5, beta, yval.s5);
+          AXPBY(result.s6, alpha, xval.s6, beta, yval.s6);
+          AXPBY(result.s7, alpha, xval.s7, beta, yval.s7);
+          AXPBY(result.s8, alpha, xval.s8, beta, yval.s8);
+          AXPBY(result.s9, alpha, xval.s9, beta, yval.s9);
+          AXPBY(result.sA, alpha, xval.sA, beta, yval.sA);
+          AXPBY(result.sB, alpha, xval.sB, beta, yval.sB);
+          AXPBY(result.sC, alpha, xval.sC, beta, yval.sC);
+          AXPBY(result.sD, alpha, xval.sD, beta, yval.sD);
+          AXPBY(result.sE, alpha, xval.sE, beta, yval.sE);
+          AXPBY(result.sF, alpha, xval.sF, beta, yval.sF);
         #endif
-
-        // Performs the accumulation (Cpm += Apm * Bpm)
-        MultiplyAccumulate(cpm, apm, bpm);
       }
+      cgm[index] = result;
     }
-    #if SA == 1 || SB == 1
-      barrier(CLK_LOCAL_MEM_FENCE);
-    #endif
-  }
-  #if GLOBAL_MEM_FENCE == 1
-    barrier(CLK_GLOBAL_MEM_FENCE);
-  #endif
-}
-
-// =================================================================================================
-// The upper-triangular and lower-triangular kernels are only used in special cases
-#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
-
-// Main entry point of the kernel. This is the upper-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
-                         const __constant real* restrict arg_alpha,
-                         const __constant real* restrict arg_beta,
-                         const __global realM* restrict agm,
-                         const __global realN* restrict bgm,
-                         __global realM* cgm) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
-
-  // Skip these threads if they do not contain threads contributing to the upper-triangle
-  if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
-    return;
-  }
-
-  // Allocates workgroup-private memory (local memory)
-  #if SA == 1
-    __local realM alm[KWG * MWG/VWM];
-  #endif
-  #if SB == 1
-    __local realN blm[KWG * NWG/VWN];
-  #endif
-
-  // Computes the matrix-multiplication and stores the result in register memory
-  realM cpm[NWI][MWI/VWM];
-  #if SA == 1 && SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
-  #elif SA == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
-  #elif SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
-  #else
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
-  #endif
-
-  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// Main entry point of the kernel. This is the lower-triangular version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void XgemmLower(const int kSizeN, const int kSizeK,
-                         const __constant real* restrict arg_alpha,
-                         const __constant real* restrict arg_beta,
-                         const __global realM* restrict agm,
-                         const __global realN* restrict bgm,
-                         __global realM* cgm) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
-
-  // Skip these threads if they do not contain threads contributing to the lower-triangle
-  if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
-    return;
   }
-
-  // Allocates workgroup-private memory (local memory)
-  #if SA == 1
-    __local realM alm[KWG * MWG/VWM];
-  #endif
-  #if SB == 1
-    __local realN blm[KWG * NWG/VWN];
-  #endif
-
-  // Computes the matrix-multiplication and stores the result in register memory
-  realM cpm[NWI][MWI/VWM];
-  #if SA == 1 && SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
-  #elif SA == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
-  #elif SB == 1
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
-  #else
-    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
-  #endif
-
-  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeN, alpha, beta);
-}
-
-// =================================================================================================
-// If not using a triangular version, include the regular kernel
-#else
-
-// Main entry point of the kernel. This is the regular full version.
-__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
-__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
-                    const __constant real* restrict arg_alpha,
-                    const __constant real* restrict arg_beta,
-                    const __global realM* restrict agm,
-                    const __global realN* restrict bgm,
-                    __global realM* cgm) {
-  const real alpha = arg_alpha[0];
-  const real beta = arg_beta[0];
-
-  // Allocates workgroup-private memory (local memory)
-  #if SA == 1
-    __local realM alm[KWG * MWG/VWM];
-  #endif
-  #if SB == 1
-    __local realN blm[KWG * NWG/VWN];
-  #endif
-
-  // Computes the matrix-multiplication and stores the result in register memory
-  realM cpm[NWI][MWI/VWM];
-  #if SA == 1 && SB == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
-  #elif SA == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
-  #elif SB == 1
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
-  #else
-    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
-  #endif
-
-  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
-  StoreResults(cgm, cpm, kSizeM, alpha, beta);
 }
 
-#endif
 // =================================================================================================
 
 // End of the C++11 raw string literal
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
new file mode 100644
index 00000000..a5faef5a
--- /dev/null
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -0,0 +1,229 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// GEMM main body: accumulates A*B into 'cpm' with the (inlined) helpers from the earlier parts.
+inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
+                      const __global realM* restrict agm, const __global realN* restrict bgm,
+                      __global realM* cgm, realM cpm[NWI][MWI/VWM]
+                      #if SA == 1 && SB == 1
+                        , __local realM* alm, __local realN* blm
+                      #elif SA == 1
+                        , __local realM* alm
+                      #elif SB == 1
+                        , __local realN* blm
+                      #endif
+                      ) {
+
+  // Workitem-private buffers (registers) holding the current values of A and B
+  realM apm[MWI/VWM];
+  realN bpm[NWI/VWN];
+
+  // Combined local thread ID, used by the local-memory loads (volatile to disable caching)
+  #if SA == 1 || SB == 1
+    volatile int tid = get_local_id(0) + MDIMC*get_local_id(1);
+  #endif
+
+  // Initializes the accumulation registers 'cpm' before looping over the K-dimension
+  InitAccRegisters(cpm);
+
+  // Loops over the K-dimension in workgroup tiles of KWG elements
+  for (int kwg=0; kwg<kSizeK; kwg+=KWG) {
+
+    // Loads data: off-chip --> local (matrix A), only when SA == 1
+    #if SA == 1
+      GlobalToLocalA(agm, alm, kSizeM, tid, kwg);
+    #endif
+    // Loads data: off-chip --> local (matrix B), only when SB == 1
+    #if SB == 1
+      GlobalToLocalB(bgm, blm, kSizeN, tid, kwg);
+    #endif
+    #if SA == 1 || SB == 1
+      barrier(CLK_LOCAL_MEM_FENCE);  // synchronises the workgroup after the local-memory loads
+    #endif
+
+    // Loops over the KWG elements of this tile, with the inner KWI iterations unrolled
+    for (int pwi=0; pwi<KWG; pwi+=KWI) {
+      #pragma unroll
+      for (int pit=0; pit<KWI; ++pit) {
+        #if SA == 0 || SB == 0
+          int idk = kwg + pwi + pit;  // K-index for the direct off-chip loads
+        #endif
+        #if SA == 1 || SB == 1
+          int kg = pwi+pit;  // K-index within the current workgroup tile
+        #endif
+
+        // Loads data: local --> private (matrix A)
+        #if SA == 1
+          LocalToPrivateA(alm, apm, kg);
+        // Loads data: off-chip --> private (matrix A)
+        #else
+          GlobalToPrivateA(agm, apm, kSizeM, idk, kwg);
+        #endif
+
+        // Loads data: local --> private (matrix B)
+        #if SB == 1
+          LocalToPrivateB(blm, bpm, kg);
+        // Loads data: off-chip --> private (matrix B)
+        #else
+          GlobalToPrivateB(bgm, bpm, kSizeN, idk);
+        #endif
+
+        // Performs the accumulation (Cpm += Apm * Bpm)
+        MultiplyAccumulate(cpm, apm, bpm);
+      }
+    }
+    #if SA == 1 || SB == 1
+      barrier(CLK_LOCAL_MEM_FENCE);  // synchronises before the next tile is loaded into local memory
+    #endif
+  }
+  #if GLOBAL_MEM_FENCE == 1
+    barrier(CLK_GLOBAL_MEM_FENCE);
+  #endif
+}
+
+// =================================================================================================
+// The upper-triangular and lower-triangular kernels are only used in special cases
+#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K)
+
+// Kernel entry point for the upper-triangular case: M and N are both given by kSizeN.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmUpper(const int kSizeN, const int kSizeK,
+                const real_arg arg_alpha,
+                const real_arg arg_beta,
+                const __global realM* restrict agm,
+                const __global realN* restrict bgm,
+                __global realM* cgm) {
+  const real alpha = GetRealArg(arg_alpha);  // scalars arrive by value as 'real_arg'
+  const real beta = GetRealArg(arg_beta);
+
+  // Skips workgroups not contributing to the upper-triangle (the test depends on group IDs only)
+  if (GetGroupID1()*NWG < GetGroupID0()*MWG) {
+    return;
+  }
+
+  // Allocates workgroup-private memory (local memory), only for the matrices with SA/SB enabled
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory ('cpm')
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// Kernel entry point for the lower-triangular case: M and N are both given by kSizeN.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void XgemmLower(const int kSizeN, const int kSizeK,
+                const real_arg arg_alpha,
+                const real_arg arg_beta,
+                const __global realM* restrict agm,
+                const __global realN* restrict bgm,
+                __global realM* cgm) {
+  const real alpha = GetRealArg(arg_alpha);  // scalars arrive by value as 'real_arg'
+  const real beta = GetRealArg(arg_beta);
+
+  // Skips workgroups not contributing to the lower-triangle (the test depends on group IDs only)
+  if (GetGroupID1()*NWG > GetGroupID0()*MWG) {
+    return;
+  }
+
+  // Allocates workgroup-private memory (local memory), only for the matrices with SA/SB enabled
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory ('cpm')
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeN, alpha, beta);
+}
+
+// =================================================================================================
+// If not using a triangular version, include the regular kernel
+#else
+
+// Kernel entry point for the regular (full, non-triangular) case with separate M and N sizes.
+__kernel __attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
+void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
+           const real_arg arg_alpha,
+           const real_arg arg_beta,
+           const __global realM* restrict agm,
+           const __global realN* restrict bgm,
+           __global realM* cgm) {
+  const real alpha = GetRealArg(arg_alpha);  // scalars arrive by value as 'real_arg'
+  const real beta = GetRealArg(arg_beta);
+
+  // Allocates workgroup-private memory (local memory), only for the matrices with SA/SB enabled
+  #if SA == 1
+    __local realM alm[KWG * MWG/VWM];
+  #endif
+  #if SB == 1
+    __local realN blm[KWG * NWG/VWN];
+  #endif
+
+  // Computes the matrix-multiplication and stores the result in register memory ('cpm')
+  realM cpm[NWI][MWI/VWM];
+  #if SA == 1 && SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm);
+  #elif SA == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm);
+  #elif SB == 1
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm);
+  #else
+    XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm);
+  #endif
+
+  // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta
+  StoreResults(cgm, cpm, kSizeM, alpha, beta);
+}
+
+#endif
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/public_api.hpp b/src/public_api.hpp
deleted file mode 100644
index d0732297..00000000
--- a/src/public_api.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file provides macro's to define the public API. This is needed when building a Windows DLL.
-// Note: this is only used for the C++ interface, the C interface has its own definition included in
-// the header file itself.
-//
-// =================================================================================================
-
-#ifndef CLBLAST_PUBLIC_API_H_
-#define CLBLAST_PUBLIC_API_H_
-
-namespace clblast {
-// =================================================================================================
-
-// Exports library functions under Windows when building a DLL. See also:
-// https://msdn.microsoft.com/en-us/library/a90k134d.aspx
-#ifdef _WIN32
-  #define PUBLIC_API __declspec(dllexport)
-#else
-  #define PUBLIC_API
-#endif
-
-// =================================================================================================
-} // namespace clblast
-
-// CLBLAST_PUBLIC_API_H_
-#endif
diff --git a/src/routine.cpp b/src/routine.cpp
index d3590896..189ae190 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -13,6 +13,7 @@
 
 #include <string>
 #include <vector>
+#include <chrono>
 
 #include "routine.hpp"
 
@@ -21,7 +22,8 @@ namespace clblast {
 
 // Constructor: not much here, because no status codes can be returned
 Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
-                 const std::vector<std::string> &routines, const Precision precision):
+                 const std::vector<std::string> &routines, const Precision precision,
+                 const std::vector<Database::DatabaseEntry> &userDatabase):
     precision_(precision),
     routine_name_(name),
     queue_(queue),
@@ -29,7 +31,7 @@ Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
     context_(queue_.GetContext()),
     device_(queue_.GetDevice()),
     device_name_(device_.Name()),
-    db_(queue_, routines, precision_) {
+    db_(queue_, routines, precision_, userDatabase) {
 }
 
 // =================================================================================================
@@ -103,6 +105,13 @@ StatusCode Routine::SetUp() {
   // Combines everything together into a single source string
   const auto source_string = defines + common_header + source_string_;
 
+  // Prints details of the routine to compile in case of debugging in verbose mode
+  #ifdef VERBOSE
+    printf("[DEBUG] Compiling routine '%s-%s' for device '%s'\n",
+           routine_name_.c_str(), ToString(precision_).c_str(), device_name_.c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
   // Compiles the kernel
   try {
     auto program = Program(context_, source_string);
@@ -123,6 +132,13 @@ StatusCode Routine::SetUp() {
     StoreProgramToCache(program, context_, precision_, routine_name_);
   } catch (...) { return StatusCode::kBuildProgramFailure; }
 
+  // Prints the elapsed compilation time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed compilation in %.2lf ms\n", timing);
+  #endif
+
   // No errors, normal termination of this function
   return StatusCode::kSuccess;
 }
diff --git a/src/routine.hpp b/src/routine.hpp
index 54b5779f..f5c607af 100644
--- a/src/routine.hpp
+++ b/src/routine.hpp
@@ -32,9 +32,11 @@ namespace clblast {
 class Routine {
  public:
 
-  // Base class constructor
+  // Base class constructor. The user database is an optional extra database to override the
+  // built-in database.
   explicit Routine(Queue &queue, EventPointer event, const std::string &name,
-                   const std::vector<std::string> &routines, const Precision precision);
+                   const std::vector<std::string> &routines, const Precision precision,
+                   const std::vector<Database::DatabaseEntry> &userDatabase = {});
 
   // Set-up phase of the kernel
   StatusCode SetUp();
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index c378df28..3969cf9f 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -12,6 +12,7 @@
 // =================================================================================================
 
 #include <vector>
+#include <chrono>
 
 #include "routines/common.hpp"
 
@@ -21,45 +22,54 @@ namespace clblast {
 // Enqueues a kernel, waits for completion, and checks for errors
 StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                      std::vector<size_t> global, const std::vector<size_t> &local,
-                     EventPointer event, std::vector<Event>& waitForEvents) {
+                     EventPointer event, const std::vector<Event> &waitForEvents) {
 
-  // Tests for validity of the local thread sizes
-  if (local.size() > device.MaxWorkItemDimensions()) {
-    return StatusCode::kInvalidLocalNumDimensions; 
-  }
-  const auto max_work_item_sizes = device.MaxWorkItemSizes();
-  for (auto i=size_t{0}; i<local.size(); ++i) {
-    if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
-  }
-  auto local_size = size_t{1};
-  for (auto &item: local) { local_size *= item; }
-  if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
+  if (!local.empty()) {
+    // Tests the local thread sizes against the device limits (skipped when 'local' is empty)
+    if (local.size() > device.MaxWorkItemDimensions()) {
+      return StatusCode::kInvalidLocalNumDimensions;
+    }
+    const auto max_work_item_sizes = device.MaxWorkItemSizes();
+    for (auto i=size_t{0}; i<local.size(); ++i) {
+      if (local[i] > max_work_item_sizes[i]) { return StatusCode::kInvalidLocalThreadsDim; }
+    }
+    auto local_size = size_t{1};
+    for (auto &item: local) { local_size *= item; }
+    if (local_size > device.MaxWorkGroupSize()) { return StatusCode::kInvalidLocalThreadsTotal; }
 
-  // Make sure the global thread sizes are at least equal to the local sizes
-  for (auto i=size_t{0}; i<global.size(); ++i) {
-    if (global[i] < local[i]) { global[i] = local[i]; }
+    // Makes global sizes at least equal to the local sizes (only dimensions present in both)
+    for (auto i=size_t{0}; i<global.size() && i<local.size(); ++i) {
+      if (global[i] < local[i]) { global[i] = local[i]; }
+    }
   }
 
   // Tests for local memory usage
   const auto local_mem_usage = kernel.LocalMemUsage(device);
   if (!device.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
 
+  // Prints the name of the kernel to launch in case of debugging in verbose mode
+  #ifdef VERBOSE
+    queue.Finish();
+    printf("[DEBUG] Running kernel '%s'\n", kernel.GetFunctionName().c_str());
+    const auto start_time = std::chrono::steady_clock::now();
+  #endif
+
   // Launches the kernel (and checks for launch errors)
   try {
     kernel.Launch(queue, global, local, event, waitForEvents);
   } catch (...) { return StatusCode::kKernelLaunchError; }
 
+  // Prints the elapsed execution time in case of debugging in verbose mode
+  #ifdef VERBOSE
+    queue.Finish();
+    const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+    const auto timing = std::chrono::duration<double,std::milli>(elapsed_time).count();
+    printf("[DEBUG] Completed kernel in %.2lf ms\n", timing);
+  #endif
+
   // No errors, normal termination of this function
   return StatusCode::kSuccess;
 }
 
-// As above, but without an event waiting list
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
-                     std::vector<size_t> global, const std::vector<size_t> &local,
-                     EventPointer event) {
-  auto emptyWaitingList = std::vector<Event>();
-  return RunKernel(kernel, queue, device, global, local, event, emptyWaitingList);
-}
-
 // =================================================================================================
 } // namespace clblast
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index c99cd39d..9d8849c3 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -29,21 +29,16 @@ namespace clblast {
 // Enqueues a kernel, waits for completion, and checks for errors
 StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
                      std::vector<size_t> global, const std::vector<size_t> &local,
-                     EventPointer event, std::vector<Event>& waitForEvents);
-
-// As above, but without an event waiting list
-StatusCode RunKernel(Kernel &kernel, Queue &queue, const Device &device,
-                     std::vector<size_t> global, const std::vector<size_t> &local,
-                     EventPointer event);
+                     EventPointer event, const std::vector<Event> &waitForEvents = {});
 
 // =================================================================================================
 
 // Copies or transposes a matrix and optionally pads/unpads it with zeros. This method is also able
 // to write to symmetric and triangular matrices through optional arguments.
 template <typename T>
-StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Context &context,
+StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device,
                                   const Database &db,
-                                  EventPointer event, std::vector<Event>& waitForEvents,
+                                  EventPointer event, const std::vector<Event> &waitForEvents,
                                   const size_t src_one, const size_t src_two,
                                   const size_t src_ld, const size_t src_offset,
                                   const Buffer<T> &src,
@@ -88,10 +83,6 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
     }
   }
 
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context, 1);
-  alpha_buffer.Write(queue, 1, &alpha);
-
   // Retrieves the kernel from the compiled binary
   try {
     auto kernel = Kernel(program, kernel_name);
@@ -101,7 +92,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
       kernel.SetArgument(0, static_cast<int>(src_ld));
       kernel.SetArgument(1, src());
       kernel.SetArgument(2, dest());
-      kernel.SetArgument(3, alpha_buffer());
+      kernel.SetArgument(3, GetRealArg(alpha));
     }
     else {
       kernel.SetArgument(0, static_cast<int>(src_one));
@@ -114,7 +105,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont
       kernel.SetArgument(7, static_cast<int>(dest_ld));
       kernel.SetArgument(8, static_cast<int>(dest_offset));
       kernel.SetArgument(9, dest());
-      kernel.SetArgument(10, alpha_buffer());
+      kernel.SetArgument(10, GetRealArg(alpha));
       if (do_pad) {
         kernel.SetArgument(11, static_cast<int>(do_conjugate));
       }
diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp
index 5b6c9e77..3445e2b5 100644
--- a/src/routines/level1/xaxpy.cpp
+++ b/src/routines/level1/xaxpy.cpp
@@ -59,20 +59,16 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
     auto kernel = Kernel(program, kernel_name);
 
-    // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-
     // Sets the kernel arguments
     if (use_fast_kernel) {
       kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha_buffer());
+      kernel.SetArgument(1, GetRealArg(alpha));
       kernel.SetArgument(2, x_buffer());
       kernel.SetArgument(3, y_buffer());
     }
     else {
       kernel.SetArgument(0, static_cast<int>(n));
-      kernel.SetArgument(1, alpha_buffer());
+      kernel.SetArgument(1, GetRealArg(alpha));
       kernel.SetArgument(2, x_buffer());
       kernel.SetArgument(3, static_cast<int>(x_offset));
       kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp
index 21fb397c..4e32ba41 100644
--- a/src/routines/level2/xgemv.cpp
+++ b/src/routines/level2/xgemv.cpp
@@ -22,7 +22,7 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemv<T>::Xgemv(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Pad", "Xgemv"}, PrecisionValue<T>()) {
+    Routine(queue, event, name, {"Pad", "Xgemv", "XgemvFast", "XgemvFastRot"}, PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level2/xgemv.opencl"
     #include "../../kernels/level2/xgemv_fast.opencl"
@@ -122,16 +122,10 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
   }
   if (fast_kernel_rot) {
     kernel_name = "XgemvFastRot";
-    global_size = m_real / db_["WPT3"];
+    global_size = m_real;
     local_size = db_["WGS3"];
   }
 
-  // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  auto beta_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &alpha);
-  beta_buffer.Write(queue_, 1, &beta);
-
   // Retrieves the Xgemv kernel from the compiled binary
   try {
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@@ -140,8 +134,8 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose,
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(m_real));
     kernel.SetArgument(1, static_cast<int>(n_real));
-    kernel.SetArgument(2, alpha_buffer());
-    kernel.SetArgument(3, beta_buffer());
+    kernel.SetArgument(2, GetRealArg(alpha));
+    kernel.SetArgument(3, GetRealArg(beta));
     kernel.SetArgument(4, static_cast<int>(a_rotated));
     kernel.SetArgument(5, a_buffer());
     kernel.SetArgument(6, static_cast<int>(a_offset));
diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp
index 353047d2..29cffe0c 100644
--- a/src/routines/level2/xger.cpp
+++ b/src/routines/level2/xger.cpp
@@ -56,10 +56,6 @@ StatusCode Xger<T>::DoGer(const Layout layout,
   status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &alpha);
-
   // Retrieves the kernel from the compiled binary
   try {
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@@ -68,7 +64,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(a_one));
     kernel.SetArgument(1, static_cast<int>(a_two));
-    kernel.SetArgument(2, alpha_buffer());
+    kernel.SetArgument(2, GetRealArg(alpha));
     kernel.SetArgument(3, x_buffer());
     kernel.SetArgument(4, static_cast<int>(x_offset));
     kernel.SetArgument(5, static_cast<int>(x_inc));
diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp
index ed8ba9e9..6dd95938 100644
--- a/src/routines/level2/xher.cpp
+++ b/src/routines/level2/xher.cpp
@@ -70,10 +70,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
   // Creates a matching version of alpha
   const auto matching_alpha = GetAlpha(alpha);
 
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &matching_alpha);
-
   // Retrieves the kernel from the compiled binary
   try {
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@@ -81,7 +77,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, alpha_buffer());
+    kernel.SetArgument(1, GetRealArg(matching_alpha));
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp
index 50572cea..3d57a9b9 100644
--- a/src/routines/level2/xher2.cpp
+++ b/src/routines/level2/xher2.cpp
@@ -58,10 +58,6 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
   status = TestVectorY(n, y_buffer, y_offset, y_inc);
   if (ErrorIn(status)) { return status; }
 
-  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
-  auto alpha_buffer = Buffer<T>(context_, 1);
-  alpha_buffer.Write(queue_, 1, &alpha);
-
   // Retrieves the kernel from the compiled binary
   try {
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
@@ -69,7 +65,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
 
     // Sets the kernel arguments
     kernel.SetArgument(0, static_cast<int>(n));
-    kernel.SetArgument(1, alpha_buffer());
+    kernel.SetArgument(1, GetRealArg(alpha));
     kernel.SetArgument(2, x_buffer());
     kernel.SetArgument(3, static_cast<int>(x_offset));
     kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 9ea5559c..0b8e768f 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -34,6 +34,7 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/convert_hermitian.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
 
@@ -63,9 +64,12 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) ||
                          (layout == Layout::kRowMajor && b_transpose == Transpose::kNo);
   const auto c_rotated = (layout == Layout::kRowMajor);
-  const auto a_do_transpose =  a_rotated;
-  const auto b_do_transpose = !b_rotated;
-  const auto c_do_transpose =  c_rotated;
+  static const auto a_want_rotated = false;
+  static const auto b_want_rotated = true;
+  static const auto c_want_rotated = false;
+  const auto a_do_transpose = a_rotated != a_want_rotated;
+  const auto b_do_transpose = b_rotated != b_want_rotated;
+  const auto c_do_transpose = c_rotated != c_want_rotated;
 
   // In case of complex data-types, the transpose can also become a conjugate transpose
   const auto a_conjugate = (a_transpose == Transpose::kConjugate);
@@ -99,6 +103,15 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   const auto n_ceiled = Ceil(n, db_["NWG"]);
   const auto k_ceiled = Ceil(k, db_["KWG"]);
 
+  // Computes the first and second "internal" (ceiled) dimensions of the 3 matrices taking into account
+  // whether the matrices need to be rotated or not for the kernel.
+  const auto a_one_i = (a_want_rotated) ? k_ceiled : m_ceiled;
+  const auto a_two_i = (a_want_rotated) ? m_ceiled : k_ceiled;
+  const auto b_one_i = (b_want_rotated) ? n_ceiled : k_ceiled;
+  const auto b_two_i = (b_want_rotated) ? k_ceiled : n_ceiled;
+  const auto c_one_i = (c_want_rotated) ? n_ceiled : m_ceiled;
+  const auto c_two_i = (c_want_rotated) ? m_ceiled : n_ceiled;
+
   // The padded/transposed input/output matrices: if memory allocation fails, throw an exception
   try {
 
@@ -106,23 +119,17 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
 
     // Determines whether or not temporary matrices are needed
-    auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 &&
+    auto a_no_temp = a_one == a_one_i && a_two == a_two_i && a_ld == a_one && a_offset == 0 &&
                      a_do_transpose == false && a_conjugate == false;
-    auto b_no_temp = b_one == n_ceiled && b_two == k_ceiled && b_ld == n_ceiled && b_offset == 0 &&
+    auto b_no_temp = b_one == b_one_i && b_two == b_two_i && b_ld == b_one && b_offset == 0 &&
                      b_do_transpose == false && b_conjugate == false;
-    auto c_no_temp = c_one == m_ceiled && c_two == n_ceiled && c_ld == m_ceiled && c_offset == 0 &&
+    auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
                      c_do_transpose == false;
 
     // Creates the temporary matrices
-    const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*m_ceiled);
-    const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
-    const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
-
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-    beta_buffer.Write(queue_, 1, &beta);
+    const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
+    const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
+    const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
 
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
@@ -133,9 +140,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     // case nothing has to be done, these kernels can be skipped.
     if (!a_no_temp) {
       auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
-                                      m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
+                                      a_one_i, a_two_i, a_one_i, 0, a_temp,
                                       ConstantOne<T>(), program,
                                       true, a_do_transpose, a_conjugate);
       if (ErrorIn(status)) { return status; }
@@ -145,9 +152,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     // As above, but now for matrix B
     if (!b_no_temp) {
       auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
                                       b_one, b_two, b_ld, b_offset, b_buffer,
-                                      n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
+                                      b_one_i, b_two_i, b_one_i, 0, b_temp,
                                       ConstantOne<T>(), program,
                                       true, b_do_transpose, b_conjugate);
       if (ErrorIn(status)) { return status; }
@@ -157,9 +164,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
     // As above, but now for matrix C. This is only necessary if C is used both as input and output.
     if (!c_no_temp && beta != static_cast<T>(0)) {
       auto eventProcessC = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
                                       c_one, c_two, c_ld, c_offset, c_buffer,
-                                      m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
+                                      c_one_i, c_two_i, c_one_i, 0, c_temp,
                                       ConstantOne<T>(), program,
                                       true, c_do_transpose, false);
       if (ErrorIn(status)) { return status; }
@@ -174,16 +181,16 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       kernel.SetArgument(0, static_cast<int>(m_ceiled));
       kernel.SetArgument(1, static_cast<int>(n_ceiled));
       kernel.SetArgument(2, static_cast<int>(k_ceiled));
-      kernel.SetArgument(3, alpha_buffer());
-      kernel.SetArgument(4, beta_buffer());
+      kernel.SetArgument(3, GetRealArg(alpha));
+      kernel.SetArgument(4, GetRealArg(beta));
       kernel.SetArgument(5, a_temp());
       kernel.SetArgument(6, b_temp());
       kernel.SetArgument(7, c_temp());
 
       // Computes the global and local thread sizes
       const auto global = std::vector<size_t>{
-        (m_ceiled * db_["MDIMC"]) / db_["MWG"],
-        (n_ceiled * db_["NDIMC"]) / db_["NWG"]
+        (c_one_i * db_["MDIMC"]) / db_["MWG"],
+        (c_two_i * db_["NDIMC"]) / db_["NWG"]
       };
       const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]};
 
@@ -196,8 +203,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       // Runs the post-processing kernel if needed
       if (!c_no_temp) {
         eventWaitList.push_back(eventKernel);
-        status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
-                                        m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
+        status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
+                                        c_one_i, c_two_i, c_one_i, 0, c_temp,
                                         c_one, c_two, c_ld, c_offset, c_buffer,
                                         ConstantOne<T>(), program,
                                         false, c_do_transpose, false);
diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp
index bd7a053e..ba770065 100644
--- a/src/routines/level3/xher2k.cpp
+++ b/src/routines/level3/xher2k.cpp
@@ -31,6 +31,7 @@ Xher2k<T,U>::Xher2k(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
 
@@ -107,12 +108,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    // Convert the arguments to complex versions
     auto complex_beta = T{beta, static_cast<U>(0.0)};
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-    beta_buffer.Write(queue_, 1, &complex_beta);
 
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
@@ -123,7 +120,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     // case nothing has to be done, these kernels can be skipped.
     if (!a1_no_temp) {
       auto eventProcessA1 = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA1.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA1.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
                                       ConstantOne<T>(), program,
@@ -133,7 +130,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     }
     if (!a2_no_temp) {
       auto eventProcessA2 = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA2.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA2.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
                                       ConstantOne<T>(), program,
@@ -143,7 +140,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     }
     if (!b1_no_temp) {
       auto eventProcessB1 = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB1.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB1.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
                                       ConstantOne<T>(), program,
@@ -153,7 +150,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     }
     if (!b2_no_temp) {
       auto eventProcessB2 = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB2.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB2.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
                                       ConstantOne<T>(), program,
@@ -165,7 +162,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
     // modify the other triangle.
     auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
+    status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                     ConstantOne<T>(), program,
@@ -180,8 +177,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(alpha));
+      kernel.SetArgument(3, GetRealArg(complex_beta));
       kernel.SetArgument(4, a1_temp());
       kernel.SetArgument(5, b2_temp());
       kernel.SetArgument(6, c_temp());
@@ -202,10 +199,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
       auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
       auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
-      alpha_buffer.Write(queue_, 1, &conjugate_alpha);
-      beta_buffer.Write(queue_, 1, &complex_one);
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(conjugate_alpha));
+      kernel.SetArgument(3, GetRealArg(complex_one));
       kernel.SetArgument(4, b1_temp());
       kernel.SetArgument(5, a2_temp());
 
@@ -218,7 +213,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       // Runs the post-processing kernel
       auto upper = (triangle == Triangle::kUpper);
       auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
                                       ConstantOne<T>(), program,
diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp
index 6ef7f21f..3063f3bc 100644
--- a/src/routines/level3/xherk.cpp
+++ b/src/routines/level3/xherk.cpp
@@ -31,6 +31,7 @@ Xherk<T,U>::Xherk(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
 
@@ -98,13 +99,9 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+    // Convert the arguments to complex versions
     auto complex_alpha = T{alpha, static_cast<U>(0.0)};
     auto complex_beta = T{beta, static_cast<U>(0.0)};
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &complex_alpha);
-    beta_buffer.Write(queue_, 1, &complex_beta);
 
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
@@ -115,7 +112,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     // case nothing has to be done, these kernels can be skipped. Two copies are created.
     if (!a_no_temp) {
       auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
                                       ConstantOne<T>(), program,
@@ -125,7 +122,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     }
     if (!b_no_temp) {
       auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
                                       ConstantOne<T>(), program,
@@ -137,7 +134,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
     // modify the other triangle.
     auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
+    status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                     ConstantOne<T>(), program,
@@ -152,8 +149,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(complex_alpha));
+      kernel.SetArgument(3, GetRealArg(complex_beta));
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, b_temp());
       kernel.SetArgument(6, c_temp());
@@ -174,7 +171,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
       // Runs the post-processing kernel
       auto upper = (triangle == Triangle::kUpper);
       auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
                                       ConstantOne<T>(), program,
diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp
index 424d4d2d..158cd9e5 100644
--- a/src/routines/level3/xsyr2k.cpp
+++ b/src/routines/level3/xsyr2k.cpp
@@ -31,6 +31,7 @@ Xsyr2k<T>::Xsyr2k(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
 
@@ -97,12 +98,6 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-    beta_buffer.Write(queue_, 1, &beta);
-
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -112,7 +107,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     // case nothing has to be done, these kernels can be skipped.
     if (!a_no_temp) {
       auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
                                       ConstantOne<T>(), program,
@@ -122,7 +117,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     }
     if (!b_no_temp) {
       auto eventProcessB = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessB.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
                                       ConstantOne<T>(), program,
@@ -134,7 +129,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
     // modify the other triangle.
     auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
+    status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                     ConstantOne<T>(), program,
@@ -149,8 +144,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(alpha));
+      kernel.SetArgument(3, GetRealArg(beta));
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, b_temp());
       kernel.SetArgument(6, c_temp());
@@ -170,8 +165,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
 
       // Swaps the arguments for matrices A and B, and sets 'beta' to 1
       auto one = static_cast<T>(1);
-      beta_buffer.Write(queue_, 1, &one);
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(3, GetRealArg(one));
       kernel.SetArgument(4, b_temp());
       kernel.SetArgument(5, a_temp());
 
@@ -184,7 +178,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
       // Runs the post-processing kernel
       auto upper = (triangle == Triangle::kUpper);
       auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
                                       ConstantOne<T>(), program,
diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp
index f56c232b..e1a72ef6 100644
--- a/src/routines/level3/xsyrk.cpp
+++ b/src/routines/level3/xsyrk.cpp
@@ -31,6 +31,7 @@ Xsyrk<T>::Xsyrk(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/transpose_pad.opencl"
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
+    #include "../../kernels/level3/xgemm_part3.opencl"
   ;
 }
 
@@ -90,12 +91,6 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
     auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
     auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
 
-    // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
-    auto alpha_buffer = Buffer<T>(context_, 1);
-    auto beta_buffer = Buffer<T>(context_, 1);
-    alpha_buffer.Write(queue_, 1, &alpha);
-    beta_buffer.Write(queue_, 1, &beta);
-
     // Events of all kernels (including pre/post processing kernels)
     auto eventWaitList = std::vector<Event>();
     auto emptyEventList = std::vector<Event>();
@@ -105,7 +100,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
     // case nothing has to be done, these kernels can be skipped.
     if (!a_no_temp) {
       auto eventProcessA = Event();
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessA.pointer(), emptyEventList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
                                       ConstantOne<T>(), program,
@@ -117,7 +112,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
     // Furthermore, also creates a (possibly padded) copy of matrix C, since it is not allowed to
     // modify the other triangle.
     auto eventProcessC = Event();
-    status = PadCopyTransposeMatrix(queue_, device_, context_, db_, eventProcessC.pointer(), emptyEventList,
+    status = PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                     ConstantOne<T>(), program,
@@ -132,8 +127,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
       // Sets the kernel arguments
       kernel.SetArgument(0, static_cast<int>(n_ceiled));
       kernel.SetArgument(1, static_cast<int>(k_ceiled));
-      kernel.SetArgument(2, alpha_buffer());
-      kernel.SetArgument(3, beta_buffer());
+      kernel.SetArgument(2, GetRealArg(alpha));
+      kernel.SetArgument(3, GetRealArg(beta));
       kernel.SetArgument(4, a_temp());
       kernel.SetArgument(5, a_temp());
       kernel.SetArgument(6, c_temp());
@@ -154,7 +149,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
       // Runs the post-processing kernel
       auto upper = (triangle == Triangle::kUpper);
       auto lower = (triangle == Triangle::kLower);
-      status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, eventWaitList,
+      status = PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
                                       ConstantOne<T>(), program,
diff --git a/src/routines/levelx/xomatcopy.cpp b/src/routines/levelx/xomatcopy.cpp
index e8593301..af9080af 100644
--- a/src/routines/levelx/xomatcopy.cpp
+++ b/src/routines/levelx/xomatcopy.cpp
@@ -72,7 +72,7 @@ StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_trans
   const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
 
   auto emptyEventList = std::vector<Event>();
-  status = PadCopyTransposeMatrix(queue_, device_, context_, db_, event_, emptyEventList,
+  status = PadCopyTransposeMatrix(queue_, device_, db_, event_, emptyEventList,
                                   a_one, a_two, a_ld, a_offset, a_buffer,
                                   b_one, b_two, b_ld, b_offset, b_buffer,
                                   alpha, program, false, transpose, conjugate);
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 34269bc7..78ded56e 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -86,11 +86,10 @@ class TuneCopy {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
   }
 
   // Describes how to compute the performance metrics
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 1e0dccd3..90f5ea82 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -86,7 +86,6 @@ class TunePad {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -97,7 +96,7 @@ class TunePad {
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(0);
     tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
     tuner.AddArgumentScalar(0);
   }
 
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 7ac19cb6..10fa80cb 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -91,11 +91,10 @@ class TuneTranspose {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
   }
 
   // Describes how to compute the performance metrics
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index 63274415..507718eb 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -90,7 +90,6 @@ class TunePadTranspose {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -101,7 +100,7 @@ class TunePadTranspose {
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(0);
     tuner.AddArgumentOutput(b_mat);
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
     tuner.AddArgumentScalar(0);
   }
 
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 88d12c1f..0033b3c6 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -89,9 +89,8 @@ class TuneXaxpy {
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
     tuner.AddArgumentInput(x_vec);
     tuner.AddArgumentOutput(y_vec);
   }
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 4b1efdef..4cb7fd00 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -7,7 +7,9 @@
 // Author(s):
 //   Cedric Nugteren <www.cedricnugteren.nl>
 //
-// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels.
+// This file uses the CLTune auto-tuner to tune the xgemm OpenCL kernels. There are two variations:
+// - V==1: This tests a limited set of tuning parameters exhaustively.
+// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
 //
 // =================================================================================================
 
@@ -21,18 +23,19 @@ namespace clblast {
 // =================================================================================================
 
 // See comment at top of file for a description of the class
-template <typename T>
+template <typename T, int V>
 class TuneXgemm {
  public:
 
   // The representative kernel and the source code
-  static std::string KernelFamily() { return "xgemm"; }
+  static std::string KernelFamily() { return (V==1) ? "xgemm_1" : "xgemm_2"; } // name depends on V
   static std::string KernelName() { return "Xgemm"; }
   static std::string GetSources() {
     return
       #include "../src/kernels/common.opencl"
       #include "../src/kernels/level3/xgemm_part1.opencl"
       #include "../src/kernels/level3/xgemm_part2.opencl"
+      #include "../src/kernels/level3/xgemm_part3.opencl"
     ;
   }
 
@@ -48,7 +51,7 @@ class TuneXgemm {
   static size_t DefaultM() { return 1024; }
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1024; }
-  static double DefaultFraction() { return 2048.0; }
+  static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // V==1: 1.0 (test all); otherwise 512.0 (sample randomly)
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
@@ -60,20 +63,38 @@ class TuneXgemm {
 
   // Sets the tuning parameters and their possible values
   static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "MWG", {16, 32, 64, 128});
-    tuner.AddParameter(id, "NWG", {16, 32, 64, 128});
-    tuner.AddParameter(id, "KWG", {16, 32});
-    tuner.AddParameter(id, "MDIMC", {8, 16, 32});
-    tuner.AddParameter(id, "NDIMC", {8, 16, 32});
-    tuner.AddParameter(id, "MDIMA", {8, 16, 32});
-    tuner.AddParameter(id, "NDIMB", {8, 16, 32});
-    tuner.AddParameter(id, "KWI", {2, 8});
-    tuner.AddParameter(id, "VWM", {1, 2, 4, 8});
-    tuner.AddParameter(id, "VWN", {1, 2, 4, 8});
-    tuner.AddParameter(id, "STRM", {0, 1});
-    tuner.AddParameter(id, "STRN", {0, 1});
-    tuner.AddParameter(id, "SA", {0, 1});
-    tuner.AddParameter(id, "SB", {0, 1});
+    if (V==1) { // variation 1: a limited set of tuning parameters, small enough to explore exhaustively
+      tuner.AddParameter(id, "MWG", {16, 32, 64});
+      tuner.AddParameter(id, "NWG", {16, 32, 64});
+      tuner.AddParameter(id, "KWG", {32});
+      tuner.AddParameter(id, "MDIMC", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMC", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMA", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMB", {8, 16, 32});
+      tuner.AddParameter(id, "KWI", {2});
+      tuner.AddParameter(id, "VWM", {1, 2, 4});
+      tuner.AddParameter(id, "VWN", {1, 2, 4});
+      tuner.AddParameter(id, "STRM", {0});
+      tuner.AddParameter(id, "STRN", {0});
+      tuner.AddParameter(id, "SA", {0, 1});
+      tuner.AddParameter(id, "SB", {0, 1});
+    } // otherwise: a much larger set of tuning parameters - too many to test all, so sampled randomly
+    else {
+      tuner.AddParameter(id, "MWG", {16, 32, 64, 128});
+      tuner.AddParameter(id, "NWG", {16, 32, 64, 128});
+      tuner.AddParameter(id, "KWG", {16, 32});
+      tuner.AddParameter(id, "MDIMC", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMC", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMA", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMB", {8, 16, 32});
+      tuner.AddParameter(id, "KWI", {2});
+      tuner.AddParameter(id, "VWM", {1, 2, 4, 8});
+      tuner.AddParameter(id, "VWN", {1, 2, 4, 8});
+      tuner.AddParameter(id, "STRM", {0, 1});
+      tuner.AddParameter(id, "STRN", {0, 1});
+      tuner.AddParameter(id, "SA", {0, 1});
+      tuner.AddParameter(id, "SB", {0, 1});
+    }
   }
 
   // Sets the constraints
@@ -92,6 +113,14 @@ class TuneXgemm {
     // KWG has to be a multiple of KDIMA = ((MDIMC*NDIMC)/(MDIMA)) and KDIMB = (...)
     tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "MDIMA"});
     tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"KWG", "MDIMC", "NDIMC", "NDIMB"});
+
+    // Extra constraints for variation 1 to limit the set of options significantly
+    if (V==1) {
+      auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
+      tuner.AddConstraint(id, IsEqual, {"MDIMC", "MDIMA"});
+      tuner.AddConstraint(id, IsEqual, {"NDIMC", "NDIMB"});
+      tuner.AddConstraint(id, IsEqual, {"SA", "SB"});
+    }
   }
 
   // Sets the local memory size
@@ -121,13 +150,11 @@ class TuneXgemm {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
-    auto beta_buffer = std::vector<T>{args.beta};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.k));
-    tuner.AddArgumentInput(alpha_buffer);
-    tuner.AddArgumentInput(beta_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+    tuner.AddArgumentScalar(GetRealArg(args.beta));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentInput(b_mat);
     tuner.AddArgumentOutput(c_mat);
@@ -147,15 +174,22 @@ class TuneXgemm {
 using float2 = clblast::float2;
 using double2 = clblast::double2;
 
-// Main function (not within the clblast namespace)
-int main(int argc, char *argv[]) {
+// Runs the tuner for variation V in the precision returned by GetPrecision (not within the clblast namespace)
+template <int V>
+void StartVariation(int argc, char *argv[]) {
   switch(clblast::GetPrecision(argc, argv)) {
-    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half>, half>(argc, argv); break;
-    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float>, float>(argc, argv); break;
-    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double>, double>(argc, argv); break;
-    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2>, float2>(argc, argv); break;
-    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemm<double2>, double2>(argc, argv); break;
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half,V>, half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float,V>, float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double,V>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2,V>, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemm<double2,V>, double2>(argc, argv); break;
   }
+}
+
+// Main function: tunes variation 1 first and then variation 2 (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  StartVariation<1>(argc, argv);
+  StartVariation<2>(argc, argv);
   return 0;
 }
 
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index d42155ae..7229602d 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -29,7 +29,7 @@ class TuneXgemv {
  public:
 
   // The representative kernel and the source code
-  static std::string KernelFamily() { return "xgemv_"+std::to_string(V); }
+  static std::string KernelFamily() { return (V==1) ? "xgemv" : ((V==2) ? "xgemv_fast" : "xgemv_fast_rot"); } // name depends on V
   static std::string KernelName() { return (V==1) ? "Xgemv" : ((V==2) ? "XgemvFast" : "XgemvFastRot"); }
   static std::string GetSources() {
     return
@@ -61,21 +61,42 @@ class TuneXgemv {
 
   // Sets the tuning parameters and their possible values
   static void SetParameters(cltune::Tuner &tuner, const size_t id) {
-    tuner.AddParameter(id, "WGS"+std::to_string(V), {64, 128, 256});
-    tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
-    if (V==2 || V==3) { tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8}); }
+    if (V==1) { // variation 1 (kernel "Xgemv"): WGS1 and WPT1 only
+      tuner.AddParameter(id, "WGS"+std::to_string(V), {32, 64, 128, 256});
+      tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
+    }
+    if (V==2) { // variation 2 (kernel "XgemvFast"): WGS2, WPT2, and additionally VW2
+      tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128, 256});
+      tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4});
+      tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8});
+    }
+    if (V==3) { // variation 3 (kernel "XgemvFastRot"): WGS3 up to 128 only, but WPT3 up to 32
+      tuner.AddParameter(id, "WGS"+std::to_string(V), {16, 32, 64, 128});
+      tuner.AddParameter(id, "WPT"+std::to_string(V), {1, 2, 4, 8, 16, 32});
+      tuner.AddParameter(id, "VW"+std::to_string(V), {1, 2, 4, 8});
+    }
   }
 
   // Sets the constraints and local memory size
   static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
-    auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
     if (V==2 || V==3) {
+      auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); }; // only needed for V==2 and V==3
       tuner.AddConstraint(id, MultipleOfX, {"WPT"+std::to_string(V), "VW"+std::to_string(V)});
     }
+    if (V==3) { // variation 3 only: WGS3 must be larger than or equal to WPT3
+      auto LargerOrEqual = [] (std::vector<size_t> v) { return v[0] >= v[1]; };
+      tuner.AddConstraint(id, LargerOrEqual, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
+    }
   }
   static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
-    auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
+    if (V==1 || V==2) { // variations 1 and 2: WGS * GetBytes(precision)
+      auto LocalMemorySize = [args] (std::vector<size_t> v) { return v[0]*GetBytes(args.precision); };
+      tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V)});
+    }
+    else { // any other variation: (WGS*WPT + WPT) * GetBytes(precision)
+      auto LocalMemorySize = [args] (std::vector<size_t> v) { return (v[0]*v[1] + v[1])*GetBytes(args.precision); };
+      tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGS"+std::to_string(V), "WPT"+std::to_string(V)});
+    }
   }
 
   // Sets the base thread configuration
@@ -89,20 +110,21 @@ class TuneXgemv {
   static TransformVector MulLocal() { return {{"WGS"+std::to_string(V)}}; }
   static TransformVector DivLocal() { return {}; }
   static TransformVector MulGlobal() { return {}; }
-  static TransformVector DivGlobal() { return {{"WPT"+std::to_string(V)}}; }
+  static TransformVector DivGlobal() {
+    if (V==1 || V==2) return {{"WPT"+std::to_string(V)}}; // variations 1 and 2: the WPT parameter
+    return {}; // any other variation: empty
+  }
 
   // Sets the kernel's arguments
   static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
-    auto beta_buffer = std::vector<T>{args.beta};
     auto a_rotated = (V==3) ? 1 : 0;
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentInput(alpha_buffer);
-    tuner.AddArgumentInput(beta_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+    tuner.AddArgumentScalar(GetRealArg(args.beta));
     tuner.AddArgumentScalar(static_cast<int>(a_rotated));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentScalar(0);
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index d2590c53..1fb5c531 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -85,10 +85,9 @@ class TuneXger {
                            std::vector<T> &x_vec, std::vector<T> &y_vec,
                            std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
                            std::vector<T> &) {
-    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
-    tuner.AddArgumentInput(alpha_buffer);
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
     tuner.AddArgumentInput(x_vec);
     tuner.AddArgumentScalar(0); // x_offset
     tuner.AddArgumentScalar(1); // x_increment
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 68e480c5..77bc72d7 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -161,6 +161,8 @@ template <typename T>
 T ConvertArgument(const char* value) {
   return static_cast<T>(std::stoi(value));
 }
+template size_t ConvertArgument(const char* value); // explicit instantiation for size_t
+
 template <> half ConvertArgument(const char* value) {
   return FloatToHalf(static_cast<float>(std::stod(value)));
 }
@@ -179,6 +181,15 @@ template <> double2 ConvertArgument(const char* value) {
   return double2{val, val};
 }
 
+// Variant of "ConvertArgument" which returns 'default_value' when 'value' is a null pointer
+template <typename T>
+T ConvertArgument(const char* value, T default_value) {
+
+  if (value) { return ConvertArgument<T>(value); } // non-null: use the regular conversion
+  return default_value;
+}
+template size_t ConvertArgument(const char* value, size_t default_value); // explicit instantiation for size_t
+
 // This function matches patterns in the form of "-option value" or "--option value". It returns a
 // default value in case the option is not found in the argument string.
 template <typename T>
@@ -332,6 +343,14 @@ void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_com
   result.Write(queue, size, result_cpu);
 }
 
+// Converts a value to the type used to pass it as a kernel argument. Half-precision is not supported
+// as a kernel argument and is converted to float with HalfToFloat; all other types pass through as-is.
+template <> typename RealArg<half>::Type GetRealArg(const half value) { return HalfToFloat(value); }
+template <> typename RealArg<float>::Type GetRealArg(const float value) { return value; }
+template <> typename RealArg<double>::Type GetRealArg(const double value) { return value; }
+template <> typename RealArg<float2>::Type GetRealArg(const float2 value) { return value; }
+template <> typename RealArg<double2>::Type GetRealArg(const double2 value) { return value; }
+
 // =================================================================================================
 
 // Rounding functions performing ceiling and division operations
diff --git a/src/utilities.hpp b/src/utilities.hpp
index 5a4eef0f..75bd5a69 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -80,8 +80,9 @@ constexpr auto kArgComparecblas = "cblas";
 constexpr auto kArgStepSize = "step";
 constexpr auto kArgNumSteps = "num_steps";
 constexpr auto kArgNumRuns = "runs";
+constexpr auto kArgWarmUp = "warm_up";
 
-// The client-specific arguments in string form
+// The test-specific arguments in string form
 constexpr auto kArgFullTest = "full_test";
 constexpr auto kArgVerbose = "verbose";
 
@@ -186,6 +187,10 @@ std::string ToString(T value);
 template <typename T>
 T ConvertArgument(const char* value);
 
+// Variant of "ConvertArgument" which returns 'default_value' when 'value' is a null pointer
+template <typename T>
+T ConvertArgument(const char* value, T default_value);
+
 // Basic argument parser, matching patterns in the form of "-option value" and "--option value"
 template <typename T>
 T GetArgument(const int argc, char **argv, std::string &help,
@@ -226,6 +231,12 @@ void FloatToHalfBuffer(std::vector<half>& result, const std::vector<float>& sour
 Buffer<float> HalfToFloatBuffer(const Buffer<half>& source, cl_command_queue queue_raw);
 void FloatToHalfBuffer(Buffer<half>& result, const Buffer<float>& source, cl_command_queue queue_raw);
 
+// Maps a type to the type used to pass it as a kernel argument: 'RealArg<T>::Type' is T itself,
+// except for half, which is not supported as a kernel argument and is therefore passed as float.
+template <typename T> struct RealArg { using Type = T; };
+template <> struct RealArg<half> { using Type = float; }; // half-precision is passed as float
+template <typename T> typename RealArg<T>::Type GetRealArg(const T value); // converts 'value' to RealArg<T>::Type
+
 // =================================================================================================
 
 // Rounding functions