38 files changed, 1713 insertions, 188 deletions
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index d57223dd..aaa76cb4 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -12,8 +12,8 @@
 // Portability here means that a similar header exists for CUDA with the same classes and
 // interfaces. In other words, moving from the OpenCL API to the CUDA API becomes a one-line change.
 //
-// This file is taken from the Claduc project <https://github.com/CNugteren/Claduc> and therefore
-// contains the following header copyright notice:
+// This file is taken from the CLCudaAPI project <https://github.com/CNugteren/CLCudaAPI> and
+// therefore contains the following header copyright notice:
 //
 // =================================================================================================
 //
@@ -97,14 +97,12 @@ class Event {
   // http://stackoverflow.com/questions/26145603/clgeteventprofilinginfo-bug-in-macosx
   float GetElapsedTime() const {
     WaitForCompletion();
-    auto bytes = size_t{0};
-    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, 0, nullptr, &bytes);
-    auto time_start = size_t{0};
+    const auto bytes = sizeof(cl_ulong);
+    auto time_start = cl_ulong{0};
     clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_START, bytes, &time_start, nullptr);
-    clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, 0, nullptr, &bytes);
-    auto time_end = size_t{0};
+    auto time_end = cl_ulong{0};
     clGetEventProfilingInfo(*event_, CL_PROFILING_COMMAND_END, bytes, &time_end, nullptr);
-    return (time_end - time_start) * 1.0e-6f;
+    return static_cast<float>(time_end - time_start) * 1.0e-6f;
   }
 
   // Accessor to the private data-member
@@ -152,6 +150,17 @@ class Platform {
   cl_platform_id platform_;
 };
 
+// Retrieves a vector with all platforms
+inline std::vector<Platform> GetAllPlatforms() {
+  auto num_platforms = cl_uint{0};
+  CheckError(clGetPlatformIDs(0, nullptr, &num_platforms));
+  auto all_platforms = std::vector<Platform>();
+  for (size_t platform_id = 0; platform_id < static_cast<size_t>(num_platforms); ++platform_id) {
+    all_platforms.push_back(Platform(platform_id));
+  }
+  return all_platforms;
+}
+
 // =================================================================================================
 
 // C++11 version of 'cl_device_id'
@@ -201,8 +210,8 @@ class Device {
   std::vector<size_t> MaxWorkItemSizes() const {
     return GetInfoVector<size_t>(CL_DEVICE_MAX_WORK_ITEM_SIZES);
   }
-  cl_ulong LocalMemSize() const {
-    return GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE);
+  unsigned long LocalMemSize() const {
+    return static_cast<unsigned long>(GetInfo<cl_ulong>(CL_DEVICE_LOCAL_MEM_SIZE));
   }
   std::string Capabilities() const { return GetInfoString(CL_DEVICE_EXTENSIONS); }
   size_t CoreClock() const {
@@ -238,9 +247,11 @@ class Device {
   // Query for a specific type of device or brand
   bool IsCPU() const { return Type() == "CPU"; }
   bool IsGPU() const { return Type() == "GPU"; }
-  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc."; }
+  bool IsAMD() const { return Vendor() == "AMD" || Vendor() == "Advanced Micro Devices, Inc." ||
+                              Vendor() == "AuthenticAMD";; }
   bool IsNVIDIA() const { return Vendor() == "NVIDIA" || Vendor() == "NVIDIA Corporation"; }
-  bool IsIntel() const { return Vendor() == "Intel" || Vendor() == "GenuineIntel"; }
+  bool IsIntel() const { return Vendor() == "INTEL" || Vendor() == "Intel" ||
+                                Vendor() == "GenuineIntel"; }
   bool IsARM() const { return Vendor() == "ARM"; }
 
   // Accessor to the private data-member
@@ -606,8 +617,7 @@ class Buffer {
 
   // Retrieves the actual allocated size in bytes
   size_t GetSize() const {
-    auto bytes = size_t{0};
-    CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, 0, nullptr, &bytes));
+    const auto bytes = sizeof(size_t);
     auto result = size_t{0};
     CheckError(clGetMemObjectInfo(*buffer_, CL_MEM_SIZE, bytes, &result, nullptr));
     return result;
@@ -658,17 +668,16 @@ class Kernel {
   }
 
   // Retrieves the amount of local memory used per work-group for this kernel
-  cl_ulong LocalMemUsage(const Device &device) const {
-    auto bytes = size_t{0};
+  unsigned long LocalMemUsage(const Device &device) const {
+    const auto bytes = sizeof(cl_ulong);
     auto query = cl_kernel_work_group_info{CL_KERNEL_LOCAL_MEM_SIZE};
-    CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, 0, nullptr, &bytes));
     auto result = cl_ulong{0};
     CheckError(clGetKernelWorkGroupInfo(*kernel_, device(), query, bytes, &result, nullptr));
-    return result;
+    return static_cast<unsigned long>(result);
   }
 
   // Retrieves the name of the kernel
-  std::string GetFunctionName() {
+  std::string GetFunctionName() const {
     auto bytes = size_t{0};
     CheckError(clGetKernelInfo(*kernel_, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &bytes));
     auto result = std::string{};
@@ -689,6 +698,7 @@ class Kernel {
   void Launch(const Queue &queue, const std::vector<size_t> &global,
               const std::vector<size_t> &local, EventPointer event,
               const std::vector<Event> &waitForEvents) {
+
     // Builds a plain version of the events waiting list
     auto waitForEventsPlain = std::vector<cl_event>();
     for (auto &waitEvent : waitForEvents) {
diff --git a/src/database/database.cpp b/src/database/database.cpp
index 34c44a29..2340a89c 100644
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@@ -21,27 +21,42 @@
 #include "database/kernels/xgemv_fast_rot.hpp"
 #include "database/kernels/xger.hpp"
 #include "database/kernels/xgemm.hpp"
+#include "database/kernels/xgemm_direct.hpp"
 #include "database/kernels/copy.hpp"
 #include "database/kernels/pad.hpp"
 #include "database/kernels/transpose.hpp"
 #include "database/kernels/padtranspose.hpp"
+#include "database/kernel_selection.hpp"
 
 namespace clblast {
 // =================================================================================================
 
 // Initializes the database
-const std::vector<Database::DatabaseEntry> Database::database = {
-  XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble,
-  XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble,
-  XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble,
-  XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble,
-  XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble,
-  XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble,
-  XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble,
-  CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble,
-  PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble,
-  TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble,
-  PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble
+const std::vector<const Database::DatabaseEntry*> Database::database = {
+  &database::XaxpyHalf, &database::XaxpySingle, &database::XaxpyDouble, &database::XaxpyComplexSingle, &database::XaxpyComplexDouble,
+  &database::XdotHalf, &database::XdotSingle, &database::XdotDouble, &database::XdotComplexSingle, &database::XdotComplexDouble,
+  &database::XgemvHalf, &database::XgemvSingle, &database::XgemvDouble, &database::XgemvComplexSingle, &database::XgemvComplexDouble,
+  &database::XgemvFastHalf, &database::XgemvFastSingle, &database::XgemvFastDouble, &database::XgemvFastComplexSingle, &database::XgemvFastComplexDouble,
+  &database::XgemvFastRotHalf, &database::XgemvFastRotSingle, &database::XgemvFastRotDouble, &database::XgemvFastRotComplexSingle, &database::XgemvFastRotComplexDouble,
+  &database::XgerHalf, &database::XgerSingle, &database::XgerDouble, &database::XgerComplexSingle, &database::XgerComplexDouble,
+  &database::XgemmHalf, &database::XgemmSingle, &database::XgemmDouble, &database::XgemmComplexSingle, &database::XgemmComplexDouble,
+  &database::XgemmDirectHalf, &database::XgemmDirectSingle, &database::XgemmDirectDouble, &database::XgemmDirectComplexSingle, &database::XgemmDirectComplexDouble,
+  &database::CopyHalf, &database::CopySingle, &database::CopyDouble, &database::CopyComplexSingle, &database::CopyComplexDouble,
+  &database::PadHalf, &database::PadSingle, &database::PadDouble, &database::PadComplexSingle, &database::PadComplexDouble,
+  &database::TransposeHalf, &database::TransposeSingle, &database::TransposeDouble, &database::TransposeComplexSingle, &database::TransposeComplexDouble,
+  &database::PadtransposeHalf, &database::PadtransposeSingle, &database::PadtransposeDouble, &database::PadtransposeComplexSingle, &database::PadtransposeComplexDouble,
+  &database::KernelSelectionHalf, &database::KernelSelectionSingle, &database::KernelSelectionDouble, &database::KernelSelectionComplexSingle, &database::KernelSelectionComplexDouble
+};
+
+// The OpenCL device vendors
+const std::string Database::kDeviceVendorAll = "default";
+
+// Alternative names for some OpenCL vendors
+const std::unordered_map<std::string, std::string> Database::kVendorNames{
+  { "Intel(R) Corporation", "Intel" },
+  { "GenuineIntel", "Intel" },
+  { "Advanced Micro Devices, Inc.", "AMD" },
+  { "NVIDIA Corporation", "NVIDIA" },
 };
 
 // =================================================================================================
@@ -49,7 +64,7 @@ const std::vector<Database::DatabaseEntry> Database::database = {
 // Constructor, computing device properties and populating the parameter-vector from the database.
 // This takes an optional overlay database in case of custom tuning or custom kernels.
 Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
-                   const Precision precision, const std::vector<DatabaseEntry> &overlay):
+                   const Precision precision, const std::vector<const DatabaseEntry*> &overlay):
   parameters_{} {
 
   // Finds information of the current device
@@ -69,8 +84,8 @@ Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
   for (auto &kernel: kernels) {
     auto search_result = ParametersPtr{};
 
-    for (auto db: { &overlay, &database }) {
-      search_result = Search(kernel, device_type, device_vendor, device_name, precision, *db);
+    for (auto &db: { database, overlay}) {
+      search_result = Search(kernel, device_type, device_vendor, device_name, precision, db);
       if (search_result) {
         parameters_.insert(search_result->begin(), search_result->end());
         break;
@@ -100,17 +115,17 @@ Database::ParametersPtr Database::Search(const std::string &this_kernel,
                                          const std::string &this_vendor,
                                          const std::string &this_device,
                                          const Precision this_precision,
-                                         const std::vector<DatabaseEntry> &this_database) const {
+                                         const std::vector<const DatabaseEntry*> &this_database) const {
 
   // Selects the right kernel
   for (auto &db: this_database) {
-    if (db.kernel == this_kernel && db.precision == this_precision) {
+    if (db->kernel == this_kernel && db->precision == this_precision) {
 
       // Searches for the right vendor and device type, or selects the default if unavailable. This
       // assumes that the default vendor / device type is last in the database.
-      for (auto &vendor: db.vendors) {
+      for (auto &vendor: db->vendors) {
         if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
-            (vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
+            (vendor.type == this_type || vendor.type == database::kDeviceTypeAll)) {
 
           // Searches for the right device. If the current device is unavailable, selects the vendor
           // default parameters. This assumes the default is last in the database.
diff --git a/src/database/database.hpp b/src/database/database.hpp
index a6ab49c5..8a3e7040 100644
--- a/src/database/database.hpp
+++ b/src/database/database.hpp
@@ -26,6 +26,19 @@
 namespace clblast {
 // =================================================================================================
 
+// A special namespace to hold all the global constant variables (including the database entries)
+namespace database {
+
+  // The OpenCL device types
+  const std::string kDeviceTypeCPU = "CPU";
+  const std::string kDeviceTypeGPU = "GPU";
+  const std::string kDeviceTypeAccelerator = "accelerator";
+  const std::string kDeviceTypeAll = "default";
+
+} // namespace database
+
+// =================================================================================================
+
 // See comment at top of file for a description of the class
 class Database {
  public:
@@ -36,54 +49,32 @@ class Database {
 
   // Structures for content inside the database
   struct DatabaseDevice {
-    const std::string name;
-    const Parameters parameters;
+    std::string name;
+    Parameters parameters;
   };
   struct DatabaseVendor {
-    const std::string type;
-    const std::string name;
-    const std::vector<DatabaseDevice> devices;
+    std::string type;
+    std::string name;
+    std::vector<DatabaseDevice> devices;
   };
   struct DatabaseEntry {
-    const std::string kernel;
-    const Precision precision;
-    const std::vector<DatabaseVendor> vendors;
+    std::string kernel;
+    Precision precision;
+    std::vector<DatabaseVendor> vendors;
   };
 
-  // The OpenCL device types
-  static constexpr auto kDeviceTypeCPU = "CPU";
-  static constexpr auto kDeviceTypeGPU = "GPU";
-  static constexpr auto kDeviceTypeAccelerator = "accelerator";
-  static constexpr auto kDeviceTypeAll = "default";
-
   // The OpenCL device vendors
-  static constexpr auto kDeviceVendorAll = "default";
+  static const std::string kDeviceVendorAll;
 
   // Alternative names for some OpenCL vendors
-  const std::unordered_map<std::string,std::string> kVendorNames {
-    {"Intel(R) Corporation", "Intel"},
-    {"GenuineIntel", "Intel"},
-    {"Advanced Micro Devices, Inc.", "AMD"},
-    {"NVIDIA Corporation", "NVIDIA"},
-  };
+  static const std::unordered_map<std::string, std::string> kVendorNames;
 
   // The database consists of separate database entries, stored together in a vector
-  static const DatabaseEntry XaxpyHalf, XaxpySingle, XaxpyDouble, XaxpyComplexSingle, XaxpyComplexDouble;
-  static const DatabaseEntry XdotHalf, XdotSingle, XdotDouble, XdotComplexSingle, XdotComplexDouble;
-  static const DatabaseEntry XgemvHalf, XgemvSingle, XgemvDouble, XgemvComplexSingle, XgemvComplexDouble;
-  static const DatabaseEntry XgemvFastHalf, XgemvFastSingle, XgemvFastDouble, XgemvFastComplexSingle, XgemvFastComplexDouble;
-  static const DatabaseEntry XgemvFastRotHalf, XgemvFastRotSingle, XgemvFastRotDouble, XgemvFastRotComplexSingle, XgemvFastRotComplexDouble;
-  static const DatabaseEntry XgerHalf, XgerSingle, XgerDouble, XgerComplexSingle, XgerComplexDouble;
-  static const DatabaseEntry XgemmHalf, XgemmSingle, XgemmDouble, XgemmComplexSingle, XgemmComplexDouble;
-  static const DatabaseEntry CopyHalf, CopySingle, CopyDouble, CopyComplexSingle, CopyComplexDouble;
-  static const DatabaseEntry PadHalf, PadSingle, PadDouble, PadComplexSingle, PadComplexDouble;
-  static const DatabaseEntry TransposeHalf, TransposeSingle, TransposeDouble, TransposeComplexSingle, TransposeComplexDouble;
-  static const DatabaseEntry PadtransposeHalf, PadtransposeSingle, PadtransposeDouble, PadtransposeComplexSingle, PadtransposeComplexDouble;
-  static const std::vector<DatabaseEntry> database;
+  static const std::vector<const DatabaseEntry*> database;
 
   // The constructor with a user-provided database overlay (potentially an empty vector)
   explicit Database(const Queue &queue, const std::vector<std::string> &routines,
-                    const Precision precision, const std::vector<DatabaseEntry> &overlay);
+                    const Precision precision, const std::vector<const DatabaseEntry*> &overlay);
 
   // Accessor of values by key
   size_t operator[](const std::string key) const { return parameters_.find(key)->second; }
@@ -95,7 +86,8 @@ class Database {
   // Search method for a specified database, returning pointer (possibly a nullptr)
   ParametersPtr Search(const std::string &this_kernel, const std::string &this_type,
                        const std::string &this_vendor, const std::string &this_device,
-                       const Precision this_precision, const std::vector<DatabaseEntry> &db) const;
+                       const Precision this_precision,
+                       const std::vector<const DatabaseEntry*> &db) const;
 
   // Found parameters suitable for this device/kernel
   Parameters parameters_;
diff --git a/src/database/kernel_selection.hpp b/src/database/kernel_selection.hpp
new file mode 100644
index 00000000..7e5e7821
--- /dev/null
+++ b/src/database/kernel_selection.hpp
@@ -0,0 +1,131 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This determines when to switch between the direct (for small sizes) and in-direct GEMM kernel
+// with pre/post-processing kernels (for larger sizes). These can be set in a similar way as for the
+// regular kernel tuning parameters: they can be specific for a certain vendor or device or can use
+// some common default values.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionHalf = {
+  "KernelSelection", Precision::kHalf, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionSingle = {
+  "KernelSelection", Precision::kSingle, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionComplexSingle = {
+  "KernelSelection", Precision::kComplexSingle, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionDouble = {
+  "KernelSelection", Precision::kDouble, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry KernelSelectionComplexDouble = {
+  "KernelSelection", Precision::kComplexDouble, {
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",384*384*384} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",768*768*768} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"XGEMM_MIN_INDIRECT_SIZE",512*512*512} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/copy.hpp b/src/database/kernels/copy.hpp
index a6b7dfe8..16aa6b3f 100644
--- a/src/database/kernels/copy.hpp
+++ b/src/database/kernels/copy.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopyHalf = {
+const Database::DatabaseEntry CopyHalf = {
   "Copy", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::CopyHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopySingle = {
+const Database::DatabaseEntry CopySingle = {
   "Copy", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::CopySingle = {
         { "Intel(R) HD Graphics 530",                        { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::CopySingle = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",4}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",4} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::CopySingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopyComplexSingle = {
+const Database::DatabaseEntry CopyComplexSingle = {
   "Copy", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -128,6 +130,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",4}, {"COPY_WPT",4} } },
         { "Iris",                                            { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
         { "Iris Pro",                                        { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -147,7 +150,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
         { "GeForce GTX 480",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",2} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",4} } },
@@ -165,7 +168,7 @@ const Database::DatabaseEntry Database::CopyComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopyDouble = {
+const Database::DatabaseEntry CopyDouble = {
   "Copy", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -205,13 +208,13 @@ const Database::DatabaseEntry Database::CopyDouble = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "Tesla K20m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",1} } },
         { "Tesla K40m",                                      { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",2}, {"COPY_WPT",2} } },
-        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",32}, {"COPY_VW",2}, {"COPY_WPT",1} } },
+        { "default",                                         { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",2}, {"COPY_WPT",1} } },
       }
     },
     { // Default
@@ -224,7 +227,7 @@ const Database::DatabaseEntry Database::CopyDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::CopyComplexDouble = {
+const Database::DatabaseEntry CopyComplexDouble = {
   "Copy", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -264,7 +267,7 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
         { "GeForce GTX 670",                                 { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 680",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 750",                                 { {"COPY_DIMX",32}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",32}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX 980",                                 { {"COPY_DIMX",8}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"COPY_DIMX",16}, {"COPY_DIMY",16}, {"COPY_VW",1}, {"COPY_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"COPY_DIMX",16}, {"COPY_DIMY",8}, {"COPY_VW",1}, {"COPY_WPT",1} } },
@@ -282,4 +285,5 @@ const Database::DatabaseEntry Database::CopyComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/pad.hpp b/src/database/kernels/pad.hpp
index 3cfabaf4..6c5e0c2f 100644
--- a/src/database/kernels/pad.hpp
+++ b/src/database/kernels/pad.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadHalf = {
+const Database::DatabaseEntry PadHalf = {
   "Pad", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadSingle = {
+const Database::DatabaseEntry PadSingle = {
   "Pad", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::PadSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "Iris Pro",                                        { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::PadSingle = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",2} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",4}, {"PAD_WPTY",2} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",4}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::PadSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadComplexSingle = {
+const Database::DatabaseEntry PadComplexSingle = {
   "Pad", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -134,10 +136,11 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris",                                            { {"PAD_DIMX",32}, {"PAD_DIMY",16}, {"PAD_WPTX",2}, {"PAD_WPTY",4} } },
         { "Iris Pro",                                        { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",4} } },
+        { "default",                                         { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
       }
     },
     { // Intel accelerators
@@ -154,13 +157,13 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "Tesla K20m",                                      { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "Tesla K40m",                                      { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",2}, {"PAD_WPTY",1} } },
+        { "default",                                         { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
       }
     },
     { // Default
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::PadComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadDouble = {
+const Database::DatabaseEntry PadDouble = {
   "Pad", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::PadDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadComplexDouble = {
+const Database::DatabaseEntry PadComplexDouble = {
   "Pad", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
         { "GeForce GTX 670",                                 { {"PAD_DIMX",32}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 680",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 750",                                 { {"PAD_DIMX",8}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",32}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
+        { "GeForce GTX 750 Ti",                              { {"PAD_DIMX",16}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX 980",                                 { {"PAD_DIMX",16}, {"PAD_DIMY",16}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
         { "GeForce GTX TITAN",                               { {"PAD_DIMX",8}, {"PAD_DIMY",32}, {"PAD_WPTX",1}, {"PAD_WPTY",2} } },
         { "GeForce GTX TITAN X",                             { {"PAD_DIMX",16}, {"PAD_DIMY",8}, {"PAD_WPTX",1}, {"PAD_WPTY",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::PadComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/padtranspose.hpp b/src/database/kernels/padtranspose.hpp
index 88bd4ea7..4003ec6d 100644
--- a/src/database/kernels/padtranspose.hpp
+++ b/src/database/kernels/padtranspose.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeHalf = {
+const Database::DatabaseEntry PadtransposeHalf = {
   "Padtranspose", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::PadtransposeHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeSingle = {
+const Database::DatabaseEntry PadtransposeSingle = {
   "Padtranspose", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris",                                            { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::PadtransposeSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
+const Database::DatabaseEntry PadtransposeComplexSingle = {
   "Padtranspose", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",4} } },
         { "Iris",                                            { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
         { "Iris Pro",                                        { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeDouble = {
+const Database::DatabaseEntry PadtransposeDouble = {
   "Padtranspose", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::PadtransposeDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
+const Database::DatabaseEntry PadtransposeComplexDouble = {
   "Padtranspose", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
         { "GeForce GTX 670",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 680",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
         { "GeForce GTX 750",                                 { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"PADTRA_PAD",1}, {"PADTRA_TILE",8}, {"PADTRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"PADTRA_PAD",0}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN",                               { {"PADTRA_PAD",1}, {"PADTRA_TILE",16}, {"PADTRA_WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"PADTRA_PAD",1}, {"PADTRA_TILE",32}, {"PADTRA_WPT",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::PadtransposeComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/transpose.hpp b/src/database/kernels/transpose.hpp
index 0e1b608e..c5ea50c2 100644
--- a/src/database/kernels/transpose.hpp
+++ b/src/database/kernels/transpose.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeHalf = {
+const Database::DatabaseEntry TransposeHalf = {
   "Transpose", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::TransposeHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeSingle = {
+const Database::DatabaseEntry TransposeSingle = {
   "Transpose", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
         { "Intel(R) HD Graphics 530",                        { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",1}, {"TRA_WPT",4} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",4} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::TransposeSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeComplexSingle = {
+const Database::DatabaseEntry TransposeComplexSingle = {
   "Transpose", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "Iris",                                            { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "Iris Pro",                                        { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
@@ -159,7 +162,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
     },
     { // Default
       kDeviceTypeAll, "default", {
-        { "default",                                         { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "default",                                         { {"TRA_DIM",8}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
       }
     },
   }
@@ -167,7 +170,7 @@ const Database::DatabaseEntry Database::TransposeComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeDouble = {
+const Database::DatabaseEntry TransposeDouble = {
   "Transpose", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -207,7 +210,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
         { "GeForce GTX 670",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 680",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",1}, {"TRA_WPT",2} } },
         { "GeForce GTX 750",                                 { {"TRA_DIM",16}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX 980",                                 { {"TRA_DIM",16}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX TITAN",                               { {"TRA_DIM",8}, {"TRA_PAD",0}, {"TRA_SHUFFLE",0}, {"TRA_WPT",2} } },
         { "GeForce GTX TITAN X",                             { {"TRA_DIM",32}, {"TRA_PAD",1}, {"TRA_SHUFFLE",0}, {"TRA_WPT",1} } },
@@ -226,7 +229,7 @@ const Database::DatabaseEntry Database::TransposeDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::TransposeComplexDouble = {
+const Database::DatabaseEntry TransposeComplexDouble = {
   "Transpose", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -278,4 +281,5 @@ const Database::DatabaseEntry Database::TransposeComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xaxpy.hpp b/src/database/kernels/xaxpy.hpp
index 9c1bcd99..60471bef 100644
--- a/src/database/kernels/xaxpy.hpp
+++ b/src/database/kernels/xaxpy.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpyHalf = {
+const Database::DatabaseEntry XaxpyHalf = {
   "Xaxpy", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XaxpyHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpySingle = {
+const Database::DatabaseEntry XaxpySingle = {
   "Xaxpy", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -64,6 +65,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",1}, {"WGS",512}, {"WPT",2} } },
         { "Iris",                                            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",128}, {"WPT",2} } },
@@ -84,7 +86,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
         { "GeForce GTX 670",                                 { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",2}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",4}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",64}, {"WPT",1} } },
@@ -103,7 +105,7 @@ const Database::DatabaseEntry Database::XaxpySingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpyComplexSingle = {
+const Database::DatabaseEntry XaxpyComplexSingle = {
   "Xaxpy", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -134,6 +136,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW",4}, {"WGS",64}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW",2}, {"WGS",512}, {"WPT",1} } },
         { "Iris",                                            { {"VW",2}, {"WGS",128}, {"WPT",1} } },
         { "Iris Pro",                                        { {"VW",1}, {"WGS",256}, {"WPT",8} } },
@@ -173,7 +176,7 @@ const Database::DatabaseEntry Database::XaxpyComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpyDouble = {
+const Database::DatabaseEntry XaxpyDouble = {
   "Xaxpy", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -213,7 +216,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
         { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",128}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",2}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",512}, {"WPT",1} } },
@@ -232,7 +235,7 @@ const Database::DatabaseEntry Database::XaxpyDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XaxpyComplexDouble = {
+const Database::DatabaseEntry XaxpyComplexDouble = {
   "Xaxpy", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -272,7 +275,7 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
         { "GeForce GTX 670",                                 { {"VW",1}, {"WGS",256}, {"WPT",1} } },
         { "GeForce GTX 680",                                 { {"VW",1}, {"WGS",64}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",256}, {"WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"VW",1}, {"WGS",64}, {"WPT",2} } },
         { "GeForce GTX 980",                                 { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"VW",1}, {"WGS",64}, {"WPT",4} } },
         { "GeForce GTX TITAN X",                             { {"VW",1}, {"WGS",1024}, {"WPT",1} } },
@@ -290,4 +293,5 @@ const Database::DatabaseEntry Database::XaxpyComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xdot.hpp b/src/database/kernels/xdot.hpp
index 987a990d..686b2839 100644
--- a/src/database/kernels/xdot.hpp
+++ b/src/database/kernels/xdot.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotHalf = {
+const Database::DatabaseEntry XdotHalf = {
   "Xdot", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XdotHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotSingle = {
+const Database::DatabaseEntry XdotSingle = {
   "Xdot", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -55,6 +56,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",64}, {"WGS2",32} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",512}, {"WGS2",128} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",512}, {"WGS2",64} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",32} } },
@@ -68,6 +70,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",512}, {"WGS2",1024} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",128} } },
         { "GeForce GTX 750",                                 { {"WGS1",128}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",1024}, {"WGS2",32} } },
@@ -84,7 +87,7 @@ const Database::DatabaseEntry Database::XdotSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotComplexSingle = {
+const Database::DatabaseEntry XdotComplexSingle = {
   "Xdot", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -106,6 +109,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",32}, {"WGS2",32} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",512}, {"WGS2",32} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",32}, {"WGS2",32} } },
         { "Iris Pro",                                        { {"WGS1",32}, {"WGS2",32} } },
         { "default",                                         { {"WGS1",32}, {"WGS2",32} } },
@@ -119,6 +123,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
@@ -135,7 +140,7 @@ const Database::DatabaseEntry Database::XdotComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotDouble = {
+const Database::DatabaseEntry XdotDouble = {
   "Xdot", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -160,6 +165,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",256}, {"WGS2",32} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",256} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",128}, {"WGS2",64} } },
         { "GeForce GTX 980",                                 { {"WGS1",128}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",512}, {"WGS2",32} } },
@@ -176,7 +182,7 @@ const Database::DatabaseEntry Database::XdotDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XdotComplexDouble = {
+const Database::DatabaseEntry XdotComplexDouble = {
   "Xdot", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -201,6 +207,7 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",512}, {"WGS2",128} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WGS2",64} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",32} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX 980",                                 { {"WGS1",64}, {"WGS2",32} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",128}, {"WGS2",32} } },
         { "Tesla K20m",                                      { {"WGS1",128}, {"WGS2",32} } },
@@ -216,4 +223,5 @@ const Database::DatabaseEntry Database::XdotComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xgemm.hpp b/src/database/kernels/xgemm.hpp
index d19c55b5..8303fa83 100644
--- a/src/database/kernels/xgemm.hpp
+++ b/src/database/kernels/xgemm.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmHalf = {
+const Database::DatabaseEntry XgemmHalf = {
   "Xgemm", Precision::kHalf, {
     { // Default
       kDeviceTypeAll, "default", {
@@ -26,7 +27,7 @@ const Database::DatabaseEntry Database::XgemmHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmSingle = {
+const Database::DatabaseEntry XgemmSingle = {
   "Xgemm", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -57,9 +58,10 @@ const Database::DatabaseEntry Database::XgemmSingle = {
         { "Intel(R) HD Graphics 530",                        { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",8} } },
         { "Iris",                                            { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
-        { "Iris Pro",                                        { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
+        { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "default",                                         { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
       }
     },
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
         { "GeForce GTX 670",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "GeForce GTX 750",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",1}, {"VWM",8}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",4}, {"VWN",8} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",4}, {"VWN",8} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemmSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmComplexSingle = {
+const Database::DatabaseEntry XgemmComplexSingle = {
   "Xgemm", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -127,6 +129,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"KWG",16}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"KWG",32}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",4}, {"VWN",1} } },
         { "Iris",                                            { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Iris Pro",                                        { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
@@ -147,7 +150,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
         { "GeForce GTX 670",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX 750",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",2} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",32}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",1} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",16}, {"NWG",64}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",1}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",1}, {"VWN",4} } },
@@ -166,7 +169,7 @@ const Database::DatabaseEntry Database::XgemmComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmDouble = {
+const Database::DatabaseEntry XgemmDouble = {
   "Xgemm", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -206,7 +209,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
         { "GeForce GTX 670",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 680",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",32}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",8}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",1} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",4}, {"VWN",2} } },
         { "GeForce GTX 980",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",32}, {"NDIMC",32}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",2}, {"VWN",4} } },
         { "GeForce GTX TITAN",                               { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",128}, {"SA",1}, {"SB",1}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -225,7 +228,7 @@ const Database::DatabaseEntry Database::XgemmDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemmComplexDouble = {
+const Database::DatabaseEntry XgemmComplexDouble = {
   "Xgemm", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -265,7 +268,7 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
         { "GeForce GTX 670",                                 { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",64}, {"SA",1}, {"SB",0}, {"STRM",0}, {"STRN",1}, {"VWM",1}, {"VWN",2} } },
         { "GeForce GTX 680",                                 { {"KWG",16}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",64}, {"NDIMB",16}, {"NDIMC",32}, {"NWG",32}, {"SA",0}, {"SB",1}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 750",                                 { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",32}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
-        { "GeForce GTX 750 Ti",                              { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",4} } },
+        { "GeForce GTX 750 Ti",                              { {"KWG",32}, {"KWI",2}, {"MDIMA",8}, {"MDIMC",8}, {"MWG",16}, {"NDIMB",8}, {"NDIMC",8}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "GeForce GTX 980",                                 { {"KWG",16}, {"KWI",2}, {"MDIMA",16}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",8}, {"NDIMC",16}, {"NWG",128}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",1}, {"VWM",2}, {"VWN",2} } },
         { "GeForce GTX TITAN X",                             { {"KWG",32}, {"KWI",8}, {"MDIMA",16}, {"MDIMC",16}, {"MWG",128}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
         { "Tesla K20m",                                      { {"KWG",32}, {"KWI",2}, {"MDIMA",32}, {"MDIMC",8}, {"MWG",32}, {"NDIMB",16}, {"NDIMC",16}, {"NWG",64}, {"SA",0}, {"SB",0}, {"STRM",1}, {"STRN",0}, {"VWM",1}, {"VWN",1} } },
@@ -282,4 +285,5 @@ const Database::DatabaseEntry Database::XgemmComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xgemm_direct.hpp b/src/database/kernels/xgemm_direct.hpp
new file mode 100644
index 00000000..89499cc6
--- /dev/null
+++ b/src/database/kernels/xgemm_direct.hpp
@@ -0,0 +1,138 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Database generator <database.py>
+//
+// This file populates the database with best-found tuning parameters for the 'Xgemm_Direct' kernels.
+//
+// =================================================================================================
+
+namespace clblast {
+namespace database {
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectHalf = {
+  "XgemmDirect", Precision::kHalf, {
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectSingle = {
+  "XgemmDirect", Precision::kSingle, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Iris Pro",                                        { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",4}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectComplexSingle = {
+  "XgemmDirect", Precision::kComplexSingle, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // Intel GPUs
+      kDeviceTypeGPU, "Intel", {
+        { "Iris Pro",                                        { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",16}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",16}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",2}, {"WGD",32} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectDouble = {
+  "XgemmDirect", Precision::kDouble, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",2}, {"VWND",4}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+
+const Database::DatabaseEntry XgemmDirectComplexDouble = {
+  "XgemmDirect", Precision::kComplexDouble, {
+    { // AMD GPUs
+      kDeviceTypeGPU, "AMD", {
+        { "AMD Radeon R9 M370X Compute Engine",              { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",16}, {"MDIMCD",16}, {"NDIMBD",16}, {"NDIMCD",16}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",16} } },
+      }
+    },
+    { // NVIDIA GPUs
+      kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+        { "default",                                         { {"KWID",2}, {"MDIMAD",32}, {"MDIMCD",32}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",1}, {"WGD",32} } },
+      }
+    },
+    { // Default
+      kDeviceTypeAll, "default", {
+        { "default",                                         { {"KWID",2}, {"MDIMAD",8}, {"MDIMCD",8}, {"NDIMBD",8}, {"NDIMCD",8}, {"PADA",1}, {"PADB",1}, {"VWMD",1}, {"VWND",2}, {"WGD",16} } },
+      }
+    },
+  }
+};
+
+// =================================================================================================
+} // namespace database
+} // namespace clblast
diff --git a/src/database/kernels/xgemv.hpp b/src/database/kernels/xgemv.hpp
index e5e8845e..90355b96 100644
--- a/src/database/kernels/xgemv.hpp
+++ b/src/database/kernels/xgemv.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvHalf = {
+const Database::DatabaseEntry XgemvHalf = {
   "Xgemv", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvSingle = {
+const Database::DatabaseEntry XgemvSingle = {
   "Xgemv", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -57,6 +58,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",64}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1} } },
         { "Iris",                                            { {"WGS1",64}, {"WPT1",2} } },
         { "Iris Pro",                                        { {"WGS1",256}, {"WPT1",2} } },
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",256}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX 980",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",256}, {"WPT1",1} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemvSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvComplexSingle = {
+const Database::DatabaseEntry XgemvComplexSingle = {
   "Xgemv", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",64}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WPT1",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WPT1",1} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",64}, {"WPT1",1} } },
         { "Iris",                                            { {"WGS1",256}, {"WPT1",1} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WPT1",1} } },
@@ -140,7 +143,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",128}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "default",                                         { {"WGS1",64}, {"WPT1",1} } },
       }
@@ -155,7 +158,7 @@ const Database::DatabaseEntry Database::XgemvComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvDouble = {
+const Database::DatabaseEntry XgemvDouble = {
   "Xgemv", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -188,7 +191,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WPT1",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WPT1",1} } },
-        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WPT1",1} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WPT1",1} } },
         { "GeForce GTX 980",                                 { {"WGS1",64}, {"WPT1",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",256}, {"WPT1",1} } },
         { "GeForce GTX TITAN X",                             { {"WGS1",64}, {"WPT1",1} } },
@@ -207,7 +210,7 @@ const Database::DatabaseEntry Database::XgemvDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvComplexDouble = {
+const Database::DatabaseEntry XgemvComplexDouble = {
   "Xgemv", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -249,4 +252,5 @@ const Database::DatabaseEntry Database::XgemvComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xgemv_fast.hpp b/src/database/kernels/xgemv_fast.hpp
index 52af628c..8e6254ac 100644
--- a/src/database/kernels/xgemv_fast.hpp
+++ b/src/database/kernels/xgemv_fast.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastHalf = {
+const Database::DatabaseEntry XgemvFastHalf = {
   "XgemvFast", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgemvFastHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastSingle = {
+const Database::DatabaseEntry XgemvFastSingle = {
   "XgemvFast", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -57,10 +58,11 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "Iris",                                            { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
         { "Iris Pro",                                        { {"VW2",1}, {"WGS2",128}, {"WPT2",2} } },
-        { "default",                                         { {"VW2",2}, {"WGS2",64}, {"WPT2",2} } },
+        { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",2} } },
       }
     },
     { // Intel accelerators
@@ -77,7 +79,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
         { "GeForce GTX 670",                                 { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 750",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",2}, {"WGS2",32}, {"WPT2",2} } },
         { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
@@ -96,7 +98,7 @@ const Database::DatabaseEntry Database::XgemvFastSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
+const Database::DatabaseEntry XgemvFastComplexSingle = {
   "XgemvFast", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -120,6 +122,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW2",2}, {"WGS2",128}, {"WPT2",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW2",1}, {"WGS2",32}, {"WPT2",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "Iris",                                            { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "Iris Pro",                                        { {"VW2",4}, {"WGS2",128}, {"WPT2",4} } },
@@ -139,7 +142,6 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
         { "GeForce GTX 480",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "default",                                         { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
       }
     },
@@ -153,7 +155,7 @@ const Database::DatabaseEntry Database::XgemvFastComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastDouble = {
+const Database::DatabaseEntry XgemvFastDouble = {
   "XgemvFast", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -186,7 +188,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
         { "GeForce GTX 670",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 680",                                 { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
         { "GeForce GTX 750",                                 { {"VW2",2}, {"WGS2",256}, {"WPT2",2} } },
-        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
+        { "GeForce GTX 750 Ti",                              { {"VW2",1}, {"WGS2",32}, {"WPT2",2} } },
         { "GeForce GTX 980",                                 { {"VW2",1}, {"WGS2",64}, {"WPT2",1} } },
         { "GeForce GTX TITAN",                               { {"VW2",1}, {"WGS2",256}, {"WPT2",1} } },
         { "GeForce GTX TITAN X",                             { {"VW2",1}, {"WGS2",128}, {"WPT2",1} } },
@@ -205,7 +207,7 @@ const Database::DatabaseEntry Database::XgemvFastDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
+const Database::DatabaseEntry XgemvFastComplexDouble = {
   "XgemvFast", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -247,4 +249,5 @@ const Database::DatabaseEntry Database::XgemvFastComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xgemv_fast_rot.hpp b/src/database/kernels/xgemv_fast_rot.hpp
index 328094e1..8fe45e01 100644
--- a/src/database/kernels/xgemv_fast_rot.hpp
+++ b/src/database/kernels/xgemv_fast_rot.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotHalf = {
+const Database::DatabaseEntry XgemvFastRotHalf = {
   "XgemvFastRot", Precision::kHalf, {
     { // Default
       kDeviceTypeAll, "default", {
@@ -26,7 +27,7 @@ const Database::DatabaseEntry Database::XgemvFastRotHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotSingle = {
+const Database::DatabaseEntry XgemvFastRotSingle = {
   "XgemvFastRot", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -44,6 +45,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
       kDeviceTypeGPU, "Intel", {
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",8}, {"WGS3",64}, {"WPT3",32} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",64}, {"WPT3",16} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW3",4}, {"WGS3",128}, {"WPT3",16} } },
         { "Iris Pro",                                        { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
         { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
@@ -51,8 +53,9 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
         { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
-        { "default",                                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",8}, {"WGS3",32}, {"WPT3",32} } },
       }
     },
     { // Default
@@ -65,7 +68,7 @@ const Database::DatabaseEntry Database::XgemvFastRotSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
+const Database::DatabaseEntry XgemvFastRotComplexSingle = {
   "XgemvFastRot", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -83,6 +86,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
       kDeviceTypeGPU, "Intel", {
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"VW3",2}, {"WGS3",16}, {"WPT3",16} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"VW3",4}, {"WGS3",128}, {"WPT3",8} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"VW3",4}, {"WGS3",32}, {"WPT3",8} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"VW3",2}, {"WGS3",32}, {"WPT3",16} } },
         { "Iris Pro",                                        { {"VW3",4}, {"WGS3",16}, {"WPT3",16} } },
         { "default",                                         { {"VW3",2}, {"WGS3",32}, {"WPT3",8} } },
@@ -98,7 +102,7 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotDouble = {
+const Database::DatabaseEntry XgemvFastRotDouble = {
   "XgemvFastRot", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -114,8 +118,9 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
     },
     { // NVIDIA GPUs
       kDeviceTypeGPU, "NVIDIA", {
+        { "GeForce GTX 750 Ti",                              { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
         { "GeForce GTX TITAN",                               { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
-        { "default",                                         { {"VW3",1}, {"WGS3",16}, {"WPT3",16} } },
+        { "default",                                         { {"VW3",4}, {"WGS3",32}, {"WPT3",16} } },
       }
     },
     { // Default
@@ -128,7 +133,7 @@ const Database::DatabaseEntry Database::XgemvFastRotDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
+const Database::DatabaseEntry XgemvFastRotComplexDouble = {
   "XgemvFastRot", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -151,4 +156,5 @@ const Database::DatabaseEntry Database::XgemvFastRotComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/database/kernels/xger.hpp b/src/database/kernels/xger.hpp
index 3e9c25c1..f2fc2a9a 100644
--- a/src/database/kernels/xger.hpp
+++ b/src/database/kernels/xger.hpp
@@ -12,9 +12,10 @@
 // =================================================================================================
 
 namespace clblast {
+namespace database {
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerHalf = {
+const Database::DatabaseEntry XgerHalf = {
   "Xger", Precision::kHalf, {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
@@ -33,7 +34,7 @@ const Database::DatabaseEntry Database::XgerHalf = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerSingle = {
+const Database::DatabaseEntry XgerSingle = {
   "Xger", Precision::kSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -63,6 +64,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",8}, {"WGS2",8}, {"WPT",4} } },
         { "Iris Pro",                                        { {"WGS1",64}, {"WGS2",1}, {"WPT",4} } },
         { "default",                                         { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
@@ -76,8 +78,9 @@ const Database::DatabaseEntry Database::XgerSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",1}, {"WPT",4} } },
         { "GeForce GTX 750",                                 { {"WGS1",64}, {"WGS2",16}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",1}, {"WPT",4} } },
+        { "default",                                         { {"WGS1",128}, {"WGS2",1}, {"WPT",2} } },
       }
     },
     { // Default
@@ -90,7 +93,7 @@ const Database::DatabaseEntry Database::XgerSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerComplexSingle = {
+const Database::DatabaseEntry XgerComplexSingle = {
   "Xger", Precision::kComplexSingle, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -120,6 +123,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
         { "Intel(R) HD Graphics 530",                        { {"WGS1",32}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics 5500 BroadWell U-Processor GT2", { {"WGS1",128}, {"WGS2",2}, {"WPT",1} } },
         { "Intel(R) HD Graphics Haswell Ultrabook GT2 Mobile", { {"WGS1",512}, {"WGS2",1}, {"WPT",1} } },
+        { "Intel(R) HD Graphics IvyBridge M GT2",            { {"WGS1",256}, {"WGS2",1}, {"WPT",2} } },
         { "Intel(R) HD Graphics Skylake ULT GT2",            { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "Iris Pro",                                        { {"WGS1",16}, {"WGS2",2}, {"WPT",4} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",1}, {"WPT",2} } },
@@ -133,6 +137,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
         { "GeForce GTX 670",                                 { {"WGS1",16}, {"WGS2",32}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX 750",                                 { {"WGS1",32}, {"WGS2",16}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",16}, {"WPT",2} } },
         { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
       }
@@ -147,7 +152,7 @@ const Database::DatabaseEntry Database::XgerComplexSingle = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerDouble = {
+const Database::DatabaseEntry XgerDouble = {
   "Xger", Precision::kDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -180,8 +185,9 @@ const Database::DatabaseEntry Database::XgerDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",32}, {"WGS2",32}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",128}, {"WGS2",4}, {"WPT",2} } },
         { "GeForce GTX 750",                                 { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",16}, {"WPT",1} } },
         { "GeForce GTX TITAN",                               { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
-        { "default",                                         { {"WGS1",256}, {"WGS2",2}, {"WPT",2} } },
+        { "default",                                         { {"WGS1",64}, {"WGS2",2}, {"WPT",2} } },
       }
     },
     { // Default
@@ -194,7 +200,7 @@ const Database::DatabaseEntry Database::XgerDouble = {
 
 // =================================================================================================
 
-const Database::DatabaseEntry Database::XgerComplexDouble = {
+const Database::DatabaseEntry XgerComplexDouble = {
   "Xger", Precision::kComplexDouble, {
     { // AMD GPUs
       kDeviceTypeGPU, "AMD", {
@@ -227,6 +233,7 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
         { "GeForce GTX 670",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",2} } },
         { "GeForce GTX 680",                                 { {"WGS1",8}, {"WGS2",16}, {"WPT",1} } },
         { "GeForce GTX 750",                                 { {"WGS1",8}, {"WGS2",32}, {"WPT",4} } },
+        { "GeForce GTX 750 Ti",                              { {"WGS1",32}, {"WGS2",8}, {"WPT",2} } },
         { "GeForce GTX TITAN",                               { {"WGS1",32}, {"WGS2",4}, {"WPT",2} } },
         { "default",                                         { {"WGS1",16}, {"WGS2",8}, {"WPT",2} } },
       }
@@ -240,4 +247,5 @@ const Database::DatabaseEntry Database::XgerComplexDouble = {
 };
 
 // =================================================================================================
+} // namespace database
 } // namespace clblast
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index 223501fd..b0817242 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -204,7 +204,7 @@ R"(
 #if PRECISION == 3232 || PRECISION == 6464
   #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
 #else
-  #define COMPLEX_CONJUGATE(value) value = value
+  #define COMPLEX_CONJUGATE(value) 
 #endif
 
 // =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl
new file mode 100644
index 00000000..a8bd450e
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part1.opencl
@@ -0,0 +1,273 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any
+// pre and and post-processing kernels.
+//
+// This kernel is seperated into three files. This is part 1 out of 3.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library. Note that all parameters here have a
+// suffix 'D' to denote that they are for the 'direct' version of the GEMM kernel.
+#ifndef WGD
+  #define WGD 8      // Tile-size in dimension M, N, and K (e.g. 8, 16, 32, 64)
+#endif
+#ifndef MDIMCD
+  #define MDIMCD 8    // Threads per workgroup in M-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef NDIMCD
+  #define NDIMCD 8    // Threads per workgroup in N-dimension (e.g. 8, 16, 32)
+#endif
+#ifndef MDIMAD
+  #define MDIMAD 8    // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#endif
+#ifndef NDIMBD
+  #define NDIMBD 8    // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#endif
+#ifndef KWID
+  #define KWID 1      // Unroll factor of the WGD loop (smaller or equal than WGD)
+#endif
+#ifndef VWMD
+  #define VWMD 1      // Vector width of matrices A and C
+#endif
+#ifndef VWND
+  #define VWND 1      // Vector width of matrix B
+#endif
+#ifndef PADA
+  #define PADA 1      // Local memory padding for matrix A
+#endif
+#ifndef PADB
+  #define PADB 1      // Local memory padding for matrix B
+#endif
+
+// Helper parameters based on the above tuning parameters
+#define MWID (WGD/MDIMCD)                // Work per work-item (M-dimension)
+#define NWID (WGD/NDIMCD)                // Work per work-item (N-dimension)
+#define KDIMAD ((MDIMCD*NDIMCD)/(MDIMAD)) // Re-shaped tile dimension of matrix A: KDIMAD * MDIMAD
+#define KDIMBD ((MDIMCD*NDIMCD)/(NDIMBD)) // Re-shaped tile dimension of matrix B: KDIMBD * NDIMBD
+#define MWAD (WGD/MDIMAD)                // Amount of loads-per-thread for matrix A (M-dimension)
+#define KWAD (WGD/KDIMAD)                // Amount of loads-per-thread for matrix A (K-dimension)
+#define KWBD (WGD/KDIMBD)                // Amount of loads-per-thread for matrix B (K-dimension)
+#define NWBD (WGD/NDIMBD)                // Amount of loads-per-thread for matrix B (N-dimension)
+
+// =================================================================================================
+
+// Data-widths in dimension M
+#if VWMD == 1
+    typedef real realMD;
+#elif VWMD == 2
+    typedef real2 realMD;
+#elif VWMD == 4
+    typedef real4 realMD;
+#elif VWMD == 8
+    typedef real8 realMD;
+#elif VWMD == 16
+    typedef real16 realMD;
+#endif
+
+// Data-widths in dimension N
+#if VWND == 1
+    typedef real realND;
+#elif VWND == 2
+    typedef real2 realND;
+#elif VWND == 4
+    typedef real4 realND;
+#elif VWND == 8
+    typedef real8 realND;
+#elif VWND == 16
+    typedef real16 realND;
+#endif
+
+// =================================================================================================
+
+// Initializes the accumulation registers to zero
+inline void InitAccRegistersDirect(real cpm[NWID][MWID]) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    #pragma unroll
+    for (int ni=0; ni<NWID; ++ni) {
+      SetToZero(cpm[ni][mi]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Performs the actual computation: Cpm += Apm * Bpm
+inline void MultiplyAccumulateDirect(real cpm[NWID][MWID], real apm[MWID], real bpm[NWID]) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+      MultiplyAdd(cpm[ni][mi], apm[mi], bpm[ni]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix.
+inline void GlobalToPrivateDirectA(const __global real* restrict agms, real apm[MWID],
+                                   const int a_ld, const int a_offset, const int idm, const int idk,
+                                   const int a_transpose, const int a_conjugate) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+    apm[mi] = agms[a_index + a_offset];
+    if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateDirectB(const __global real* restrict bgms, real bpm[NWID],
+                                   const int b_ld, const int b_offset, const int idn, const int idk,
+                                   const int b_transpose, const int b_conjugate) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+    bpm[ni] = bgms[b_index + b_offset];
+    if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+  }
+}
+
+// Loads global off-chip memory into thread-private register files. This function is specific for
+// loading the A input matrix. This is the same as above but now includes a bounds check.
+inline void GlobalToPrivateCheckedA(const __global real* restrict agms, real apm[MWID],
+                                    const int a_ld, const int a_offset, const int idm, const int idk,
+                                    const int a_transpose, const int a_conjugate,
+                                    const int kSizeM) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    if (idm + mi < kSizeM) {
+      const int a_index = (a_transpose) ? (idm + mi)*a_ld + idk : idk*a_ld + (idm + mi);
+      apm[mi] = agms[a_index + a_offset];
+      if (a_conjugate) { COMPLEX_CONJUGATE(apm[mi]); }
+    }
+    else {
+      SetToZero(apm[mi]);
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToPrivateCheckedB(const __global real* restrict bgms, real bpm[NWID],
+                                    const int b_ld, const int b_offset, const int idn, const int idk,
+                                    const int b_transpose, const int b_conjugate,
+                                    const int kSizeN) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    if (idn + ni < kSizeN) {
+      const int b_index = (b_transpose) ? (idn + ni)*b_ld + idk : idk*b_ld + (idn + ni);
+      bpm[ni] = bgms[b_index + b_offset];
+      if (b_conjugate) { COMPLEX_CONJUGATE(bpm[ni]); }
+    }
+    else {
+      SetToZero(bpm[ni]);
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches on-chip local memory into per-thread private memory (registers). This function is specific
+// for caching the A input matrix.
+inline void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg,
+                                  const int a_transpose) {
+  #pragma unroll
+  for (int mi=0; mi<MWID; ++mi) {
+    const int mg = mi + get_local_id(0)*MWID;
+    const int index = (a_transpose) ? mg*(WGD + PADA) + kg : kg*(WGD + PADA) + mg;
+    apm[mi] = alm[index];
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void LocalToPrivateDirectB(__local real* blm, real bpm[NWID], const int kg,
+                                  const int b_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    const int ng = ni + get_local_id(1)*NWID;
+    const int index = (b_transpose) ? ng*(WGD + PADB) + kg : kg*(WGD + PADB) + ng;
+    bpm[ni] = blm[index];
+  }
+}
+
+// =================================================================================================
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsDirect(__global real* cgm, real cpm[NWID][MWID],
+                               const int idm, const int idn,
+                               const real alpha, const real beta,
+                               const int c_ld, const int c_offset, const int c_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+
+      // Determines the destination index
+      int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+      // The final multiplication with alpha (in case beta == 0)
+      real result;
+      if (IsZero(beta)) {
+        Multiply(result, alpha, cpm[ni][mi]);
+      }
+      // The final multiplication with alpha and the addition with beta*C
+      else {
+        AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+      }
+      cgm[c_index + c_offset] = result;
+    }
+  }
+}
+
+// Merges the results in Cpm with the global array in Cgm. This also performs the multiplication
+// with the constants: Cgm = alpha*A*B + beta*Cgm = alpha*Cpm + beta*Cgm
+inline void StoreResultsChecked(__global real* cgm, real cpm[NWID][MWID],
+                                const int idm, const int idn, const int kSizeM, const int kSizeN,
+                                const real alpha, const real beta,
+                                const int c_ld, const int c_offset, const int c_transpose) {
+  #pragma unroll
+  for (int ni=0; ni<NWID; ++ni) {
+    #pragma unroll
+    for (int mi=0; mi<MWID; ++mi) {
+      if ((idm + mi) < kSizeM && (idn + ni) < kSizeN) {
+
+        // Determines the destination index
+        int c_index = (c_transpose) ? (idm + mi)*c_ld + (idn + ni) : (idn + ni)*c_ld + (idm + mi);
+
+        // The final multiplication with alpha (in case beta == 0)
+        real result;
+        if (IsZero(beta)) {
+          Multiply(result, alpha, cpm[ni][mi]);
+        }
+        // The final multiplication with alpha and the addition with beta*C
+        else {
+          AXPBY(result, alpha, cpm[ni][mi], beta, cgm[c_index + c_offset]);
+        }
+        cgm[c_index + c_offset] = result;
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part2.opencl b/src/kernels/level3/xgemm_direct_part2.opencl
new file mode 100644
index 00000000..d77cbf65
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part2.opencl
@@ -0,0 +1,314 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 2 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix.
+inline void GlobalToLocalDirectA(const __global realMD* restrict agm, __local real* alm,
+                                 const int a_ld, const int a_offset, const int kwg,
+                                 const int a_transpose, const int a_conjugate) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD/VWMD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*(MWAD/VWMD);
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg/VWMD : mg + GetGroupID0()*(WGD/VWMD);
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      const realMD avec = agm[idk*(a_ld/VWMD) + idm + a_offset];
+      #if VWMD == 1
+         alm[kg*(WGD + PADA) + mg] = avec;
+      #elif VWMD == 2
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+      #elif VWMD == 4
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.x;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.y;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.z;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.w;
+      #elif VWMD == 8
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+      #elif VWMD == 16
+         alm[kg*(WGD + PADA) + mg*VWMD + 0] = avec.s0;
+         alm[kg*(WGD + PADA) + mg*VWMD + 1] = avec.s1;
+         alm[kg*(WGD + PADA) + mg*VWMD + 2] = avec.s2;
+         alm[kg*(WGD + PADA) + mg*VWMD + 3] = avec.s3;
+         alm[kg*(WGD + PADA) + mg*VWMD + 4] = avec.s4;
+         alm[kg*(WGD + PADA) + mg*VWMD + 5] = avec.s5;
+         alm[kg*(WGD + PADA) + mg*VWMD + 6] = avec.s6;
+         alm[kg*(WGD + PADA) + mg*VWMD + 7] = avec.s7;
+         alm[kg*(WGD + PADA) + mg*VWMD + 8] = avec.s8;
+         alm[kg*(WGD + PADA) + mg*VWMD + 9] = avec.s9;
+         alm[kg*(WGD + PADA) + mg*VWMD + 10] = avec.sA;
+         alm[kg*(WGD + PADA) + mg*VWMD + 11] = avec.sB;
+         alm[kg*(WGD + PADA) + mg*VWMD + 12] = avec.sC;
+         alm[kg*(WGD + PADA) + mg*VWMD + 13] = avec.sD;
+         alm[kg*(WGD + PADA) + mg*VWMD + 14] = avec.sE;
+         alm[kg*(WGD + PADA) + mg*VWMD + 15] = avec.sF;
+      #endif
+      if (a_conjugate) {
+        for (int vm=0; vm<VWMD; ++vm) {
+          COMPLEX_CONJUGATE(alm[kg*(WGD + PADA) + mg*VWMD + vm]);
+        }
+      }
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalDirectB(const __global realND* restrict bgm, __local real* blm,
+                                 const int b_ld, const int b_offset, const int kwg,
+                                 const int b_transpose, const int b_conjugate) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD/VWND; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*(NWBD/VWND);
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg/VWND : ng + GetGroupID1()*(WGD/VWND);
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      const realND bvec = bgm[idk*(b_ld/VWND) + idn + b_offset];
+      #if VWND == 1
+         blm[kg*(WGD + PADB) + ng] = bvec;
+      #elif VWND == 2
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+      #elif VWND == 4
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.x;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.y;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.z;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.w;
+      #elif VWND == 8
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+      #elif VWND == 16
+         blm[kg*(WGD + PADB) + ng*VWND + 0] = bvec.s0;
+         blm[kg*(WGD + PADB) + ng*VWND + 1] = bvec.s1;
+         blm[kg*(WGD + PADB) + ng*VWND + 2] = bvec.s2;
+         blm[kg*(WGD + PADB) + ng*VWND + 3] = bvec.s3;
+         blm[kg*(WGD + PADB) + ng*VWND + 4] = bvec.s4;
+         blm[kg*(WGD + PADB) + ng*VWND + 5] = bvec.s5;
+         blm[kg*(WGD + PADB) + ng*VWND + 6] = bvec.s6;
+         blm[kg*(WGD + PADB) + ng*VWND + 7] = bvec.s7;
+         blm[kg*(WGD + PADB) + ng*VWND + 8] = bvec.s8;
+         blm[kg*(WGD + PADB) + ng*VWND + 9] = bvec.s9;
+         blm[kg*(WGD + PADB) + ng*VWND + 10] = bvec.sA;
+         blm[kg*(WGD + PADB) + ng*VWND + 11] = bvec.sB;
+         blm[kg*(WGD + PADB) + ng*VWND + 12] = bvec.sC;
+         blm[kg*(WGD + PADB) + ng*VWND + 13] = bvec.sD;
+         blm[kg*(WGD + PADB) + ng*VWND + 14] = bvec.sE;
+         blm[kg*(WGD + PADB) + ng*VWND + 15] = bvec.sF;
+      #endif
+      if (b_conjugate) {
+        for (int vn=0; vn<VWND; ++vn) {
+          COMPLEX_CONJUGATE(blm[kg*(WGD + PADB) + ng*VWND + vn]);
+        }
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs doesn't
+// use the vector data-types.
+inline void GlobalToLocalScalarA(const __global real* restrict agms, __local real* alm,
+                                 const int a_ld, const int a_offset, const int kwg,
+                                 const int a_transpose, const int a_conjugate) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*MWAD;
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      real result = agms[idk*a_ld + idm + a_offset];
+      if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+      alm[kg*(WGD + PADA) + mg] = result;
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalScalarB(const __global real* restrict bgms, __local real* blm,
+                                 const int b_ld, const int b_offset, const int kwg,
+                                 const int b_transpose, const int b_conjugate) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*NWBD;
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      real result = bgms[idk*b_ld + idn + b_offset];
+      if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+      blm[kg*(WGD + PADB) + ng] = result;
+    }
+  }
+}
+
+// =================================================================================================
+
+// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for
+// caching the A input matrix. In contrast to the functions above, this function performs bounds
+// checks and doesn't use the vector data-types.
+inline void GlobalToLocalCheckedA(const __global real* restrict agms, __local real* alm,
+                                  const int a_ld, const int a_offset, const int kwg,
+                                  const int a_transpose, const int a_conjugate,
+                                  const int kSizeM, const int kSizeK) {
+  #if MDIMCD == MDIMAD
+    const int la0 = get_local_id(0);
+    const int la1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int la0 = tid % MDIMAD;
+    const int la1 = tid / MDIMAD;
+  #endif
+  #pragma unroll
+  for (int mia=0; mia<MWAD; ++mia) {
+    #pragma unroll
+    for (int kia=0; kia<KWAD; ++kia) {
+
+      // Computes the indices for the global memory
+      int mg = mia + la0*MWAD;
+      int kg = kia + la1*KWAD;
+      int idm = (a_transpose) ? mg + kwg : mg + GetGroupID0()*WGD;
+      int idk = (a_transpose) ? kg + GetGroupID0()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      int condition = (a_transpose) ? idm < kSizeK : idm < kSizeM;
+      if (condition) {
+        real result = agms[idk*a_ld + idm + a_offset];
+        if (a_conjugate) { COMPLEX_CONJUGATE(result); }
+        alm[kg*(WGD + PADA) + mg] = result;
+      }
+      else {
+        SetToZero(alm[kg*(WGD + PADA) + mg]);
+      }
+    }
+  }
+}
+
+// Same as above, but now for the B input matrix
+inline void GlobalToLocalCheckedB(const __global real* restrict bgms, __local real* blm,
+                                  const int b_ld, const int b_offset, const int kwg,
+                                  const int b_transpose, const int b_conjugate,
+                                  const int kSizeN, const int kSizeK) {
+  #if MDIMCD == NDIMBD
+    const int lb0 = get_local_id(0);
+    const int lb1 = get_local_id(1);
+  #else
+    const int tid = get_local_id(0) + MDIMCD*get_local_id(1);
+    const int lb0 = tid % NDIMBD;
+    const int lb1 = tid / NDIMBD;
+  #endif
+  #pragma unroll
+  for (int kib=0; kib<KWBD; ++kib) {
+    #pragma unroll
+    for (int nib=0; nib<NWBD; ++nib) {
+
+      // Computes the indices for the global memory
+      int ng = nib + lb0*NWBD;
+      int kg = kib + lb1*KWBD;
+      int idn = (b_transpose) ? ng + kwg : ng + GetGroupID1()*WGD;
+      int idk = (b_transpose) ? kg + GetGroupID1()*WGD : kg + kwg;
+
+      // Loads the data from global memory into the local memory
+      int condition = (b_transpose) ? idn < kSizeK : idn < kSizeN;
+      if (condition) {
+        real result = bgms[idk*b_ld + idn + b_offset];
+        if (b_conjugate) { COMPLEX_CONJUGATE(result); }
+        blm[kg*(WGD + PADB) + ng] = result;
+      }
+      else {
+        SetToZero(blm[kg*(WGD + PADB) + ng]);
+      }
+    }
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/kernels/level3/xgemm_direct_part3.opencl b/src/kernels/level3/xgemm_direct_part3.opencl
new file mode 100644
index 00000000..a9350e00
--- /dev/null
+++ b/src/kernels/level3/xgemm_direct_part3.opencl
@@ -0,0 +1,214 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This is part 3 of 3 of the GEMM kernel. See part 1 for more information.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Main body of the kernel. This is the direct version without pre/post processing and restrictions.
+inline void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK,
+                        const real_arg arg_alpha,
+                        const real_arg arg_beta,
+                        const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                        const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                        __global real* cgm, const int c_offset, const int c_ld,
+                        __local real* alm, __local real* blm,
+                        const int a_transpose, const int b_transpose, const int c_transpose,
+                        const int a_conjugate, const int b_conjugate) {
+  const real alpha = GetRealArg(arg_alpha);
+  const real beta = GetRealArg(arg_beta);
+
+  // Extra pointers to scalar versions of global memory
+  const __global real* restrict agms = (const __global real* restrict) agm;
+  const __global real* restrict bgms = (const __global real* restrict) bgm;
+
+  // Allocates workitem-private memory (registers)
+  real apm[MWID];
+  real bpm[NWID];
+  real cpm[NWID][MWID];
+
+  // Initializes the accumulation registers
+  InitAccRegistersDirect(cpm);
+
+  // The faster version of GEMM is not allowed on the (incomplete) borders. Therefore, this section
+  // processes only the main parts: output blocks of WGD by WGD.
+  const int idm = get_local_id(0) * MWID + GetGroupID0() * WGD;
+  const int idn = get_local_id(1) * NWID + GetGroupID1() * WGD;
+  if ((idm < (kSizeM/WGD)*WGD) && (idn < (kSizeN/WGD)*WGD)) {
+
+    // Loops over all complete workgroup tiles (K-dimension)
+    int kwg = 0;
+    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+      // Loads data: off-chip --> local (matrix A and B)
+      if (a_ld % VWMD == 0) {
+        GlobalToLocalDirectA(agm, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+      }
+      else {
+        GlobalToLocalScalarA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate);
+      }
+      if (b_ld % VWND == 0) {
+        GlobalToLocalDirectB(bgm, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+      }
+      else {
+        GlobalToLocalScalarB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate);
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // Loops over all workitem tiles, unrolled by a factor KWID
+      for (int pwi=0; pwi<WGD; pwi+=KWID) {
+        #pragma unroll
+        for (int pit=0; pit<KWID; ++pit) {
+          int kg = pwi + pit;
+
+          // Loads data: local --> private (matrix A and B)
+          LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+          // Performs the accumulation (Cpm += Apm * Bpm)
+          MultiplyAccumulateDirect(cpm, apm, bpm);
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Loop over the remaining part (incomplete tile in K-dimension)
+    for (; kwg < kSizeK; ++kwg) {
+
+      // Loads data: off-chip --> private (matrix A and B)
+      GlobalToPrivateDirectA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate);
+      GlobalToPrivateDirectB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate);
+
+      // Performs the accumulation (Cpm += Apm * Bpm)
+      MultiplyAccumulateDirect(cpm, apm, bpm);
+    }
+
+    // Stores a tile of results and performs the multiplication with alpha and beta
+    StoreResultsDirect(cgm, cpm, idm, idn, alpha, beta, c_ld, c_offset, c_transpose);
+  }
+
+  // Simple but slower version for the parts on the edge (incomplete tiles in M and N-dimensions)
+  else {
+
+    // Loops over all complete workgroup tiles (K-dimension)
+    int kwg = 0;
+    for (; kwg < (kSizeK/WGD) * WGD; kwg+=WGD) {
+
+      // Loads data: off-chip --> local (matrix A and B)
+      GlobalToLocalCheckedA(agms, alm, a_ld, a_offset, kwg, a_transpose, a_conjugate, kSizeM, kSizeK);
+      GlobalToLocalCheckedB(bgms, blm, b_ld, b_offset, kwg, b_transpose, b_conjugate, kSizeN, kSizeK);
+      barrier(CLK_LOCAL_MEM_FENCE);
+
+      // Loops over all workitem tiles, unrolled by a factor KWID
+      for (int pwi=0; pwi<WGD; pwi+=KWID) {
+        #pragma unroll
+        for (int pit=0; pit<KWID; ++pit) {
+          int kg = pwi + pit;
+
+          // Loads data: local --> private (matrix A and B)
+          LocalToPrivateDirectA(alm, apm, kg, a_transpose);
+          LocalToPrivateDirectB(blm, bpm, kg, b_transpose);
+
+          // Performs the accumulation (Cpm += Apm * Bpm)
+          MultiplyAccumulateDirect(cpm, apm, bpm);
+        }
+      }
+      barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Loop over the remaining part (incomplete tile in K-dimension)
+    for (; kwg < kSizeK; ++kwg) {
+
+      // Loads data: off-chip --> private (matrix A and B)
+      GlobalToPrivateCheckedA(agms, apm, a_ld, a_offset, idm, kwg, a_transpose, a_conjugate, kSizeM);
+      GlobalToPrivateCheckedB(bgms, bpm, b_ld, b_offset, idn, kwg, b_transpose, b_conjugate, kSizeN);
+
+      // Performs the accumulation (Cpm += Apm * Bpm)
+      MultiplyAccumulateDirect(cpm, apm, bpm);
+    }
+
+    // Stores a tile of results and performs the multiplication with alpha and beta
+    StoreResultsChecked(cgm, cpm, idm, idn, kSizeM, kSizeN, alpha, beta, c_ld, c_offset, c_transpose);
+  }
+}
+
+// =================================================================================================
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNN(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 0, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [non-transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectNT(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 0, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, non-transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTN(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 1, 0, c_transpose, a_conjugate, b_conjugate);
+}
+
+// Direct version of the GEMM kernel with [A, B] = [transposed, transposed]
+__attribute__((reqd_work_group_size(MDIMCD, NDIMCD, 1)))
+__kernel void XgemmDirectTT(const int kSizeM, const int kSizeN, const int kSizeK,
+                            const real_arg arg_alpha, const real_arg arg_beta,
+                            const __global realMD* restrict agm, const int a_offset, const int a_ld,
+                            const __global realND* restrict bgm, const int b_offset, const int b_ld,
+                            __global real* cgm, const int c_offset, const int c_ld,
+                            const int c_transpose, const int a_conjugate, const int b_conjugate) {
+  __local real alm[WGD * (WGD + PADA)];
+  __local real blm[WGD * (WGD + PADB)];
+  XgemmDirect(kSizeM, kSizeN, kSizeK, arg_alpha, arg_beta,
+              agm, a_offset, a_ld, bgm, b_offset, b_ld, cgm, c_offset, c_ld,
+              alm, blm, 1, 1, c_transpose, a_conjugate, b_conjugate);
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================
diff --git a/src/msvc.hpp b/src/msvc.hpp
new file mode 100644
index 00000000..a45105df
--- /dev/null
+++ b/src/msvc.hpp
@@ -0,0 +1,39 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file provides macro's and definitions to make compilation work on Microsoft Visual Studio,
+// in particular for versions older than 2015 with limited C++11 support.
+// MSVC++ 14.0 _MSC_VER == 1900 (Visual Studio 2015)
+// MSVC++ 12.0 _MSC_VER == 1800 (Visual Studio 2013)
+// MSVC++ 11.0 _MSC_VER == 1700 (Visual Studio 2012)
+// MSVC++ 10.0 _MSC_VER == 1600 (Visual Studio 2010)
+// MSVC++ 9.0  _MSC_VER == 1500 (Visual Studio 2008)
+//
+// =================================================================================================
+
+#ifndef CLBLAST_MSVC_HPP_
+#define CLBLAST_MSVC_HPP_
+
+namespace clblast {
+// =================================================================================================
+#ifdef _MSC_VER
+
+// No support for constexpr prior to 2015. Note that this only works with constants, not with
+// constexpr functions (unused in this project).
+#if _MSC_VER < 1900
+#define constexpr const
+#endif
+
+// _MSC_VER
+#endif
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_MSVC_HPP_
+#endif
diff --git a/src/routine.cpp b/src/routine.cpp
index 189ae190..80764b74 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -14,6 +14,7 @@
 #include <string>
 #include <vector>
 #include <chrono>
+#include <cstdlib>
 
 #include "routine.hpp"
 
@@ -23,7 +24,7 @@ namespace clblast {
 // Constructor: not much here, because no status codes can be returned
 Routine::Routine(Queue &queue, EventPointer event, const std::string &name,
                  const std::vector<std::string> &routines, const Precision precision,
-                 const std::vector<Database::DatabaseEntry> &userDatabase):
+                 const std::vector<const Database::DatabaseEntry*> &userDatabase):
     precision_(precision),
     routine_name_(name),
     queue_(queue),
@@ -42,13 +43,19 @@ StatusCode Routine::SetUp() {
   // Queries the cache to see whether or not the program (context-specific) is already there
   if (ProgramIsInCache(context_, precision_, routine_name_)) { return StatusCode::kSuccess; }
 
+  // Sets the build options from an environmental variable (if set)
+  auto options = std::vector<std::string>();
+  const auto environment_variable = std::getenv("CLBLAST_BUILD_OPTIONS");
+  if (environment_variable != nullptr) {
+    options.push_back(std::string(environment_variable));
+  }
+
   // Queries the cache to see whether or not the binary (device-specific) is already there. If it
   // is, a program is created and stored in the cache
   if (BinaryIsInCache(device_name_, precision_, routine_name_)) {
     try {
       auto& binary = GetBinaryFromCache(device_name_, precision_, routine_name_);
       auto program = Program(device_, context_, binary);
-      auto options = std::vector<std::string>();
       program.Build(device_, options);
       StoreProgramToCache(program, context_, precision_, routine_name_);
     } catch (...) { return StatusCode::kBuildProgramFailure; }
@@ -115,7 +122,6 @@ StatusCode Routine::SetUp() {
   // Compiles the kernel
   try {
     auto program = Program(context_, source_string);
-    auto options = std::vector<std::string>();
     const auto build_status = program.Build(device_, options);
 
     // Checks for compiler crashes/errors/warnings
diff --git a/src/routine.hpp b/src/routine.hpp
index f5c607af..8582a2b7 100644
--- a/src/routine.hpp
+++ b/src/routine.hpp
@@ -36,7 +36,7 @@ class Routine {
   // built-in database.
   explicit Routine(Queue &queue, EventPointer event, const std::string &name,
                    const std::vector<std::string> &routines, const Precision precision,
-                   const std::vector<Database::DatabaseEntry> &userDatabase = {});
+                   const std::vector<const Database::DatabaseEntry*> &userDatabase = {});
 
   // Set-up phase of the kernel
   StatusCode SetUp();
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 0b8e768f..1602c69f 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -22,7 +22,9 @@ namespace clblast {
 // Constructor: forwards to base class constructor
 template <typename T>
 Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
-    Routine(queue, event, name, {"Copy","Pad","Transpose","Padtranspose","Xgemm"}, PrecisionValue<T>()) {
+    Routine(queue, event, name,
+            {"Copy","Pad","Transpose","Padtranspose","Xgemm","XgemmDirect","KernelSelection"},
+            PrecisionValue<T>()) {
   source_string_ =
     #include "../../kernels/level3/level3.opencl"
     #include "../../kernels/level3/copy_fast.opencl"
@@ -32,10 +34,16 @@ Xgemm<T>::Xgemm(Queue &queue, EventPointer event, const std::string &name):
     #include "../../kernels/level3/convert_symmetric.opencl"
     #include "../../kernels/level3/convert_triangular.opencl"
     #include "../../kernels/level3/convert_hermitian.opencl"
+    #include "../../kernels/level3/xgemm_direct_part1.opencl"
+    #include "../../kernels/level3/xgemm_direct_part2.opencl"
+    #include "../../kernels/level3/xgemm_direct_part3.opencl"
+  ;
+  auto source_string_part_2 = // separated in two parts to prevent C1091 in MSVC 2013
     #include "../../kernels/level3/xgemm_part1.opencl"
     #include "../../kernels/level3/xgemm_part2.opencl"
     #include "../../kernels/level3/xgemm_part3.opencl"
   ;
+  source_string_ += source_string_part_2;
 }
 
 // =================================================================================================
@@ -98,6 +106,44 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld);
   if (ErrorIn(status)) { return status; }
 
+  // Selects which version of GEMM to run
+  const auto do_gemm_direct = (m * n * k < db_["XGEMM_MIN_INDIRECT_SIZE"]);
+  if (do_gemm_direct) { // for small sizes (single kernel)
+    return GemmDirect(m, n, k, alpha,
+                      a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+                      c_buffer, c_offset, c_ld,
+                      a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate);
+  }
+  else { // for larger sizes (pre/post-processing plus a very fast kernel)
+    return GemmIndirect(m, n, k, alpha,
+                        a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta,
+                        c_buffer, c_offset, c_ld,
+                        a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate,
+                        a_one, a_two, a_want_rotated,
+                        b_one, b_two, b_want_rotated,
+                        c_one, c_two, c_want_rotated);
+  }
+}
+
+// =================================================================================================
+
+// The indirect version of GEMM. This uses the faster but non-general kernel. It has specific
+// requirements, but several pre and post-processing kernels take care of those. However, the
+// overhead of these extra kernels might not be ideal for certain devices/arguments.
+template <typename T>
+StatusCode Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
+                                  const T alpha,
+                                  const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                                  const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                                  const T beta,
+                                  const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                                  const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                                  const bool a_conjugate, const bool b_conjugate,
+                                  const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                                  const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                                  const size_t c_one, const size_t c_two, const bool c_want_rotated) {
+  auto status = StatusCode::kSuccess;
+
   // Calculates the ceiled versions of m, n, and k
   const auto m_ceiled = Ceil(m, db_["MWG"]);
   const auto n_ceiled = Ceil(n, db_["NWG"]);
@@ -217,6 +263,66 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
   } catch (...) { return StatusCode::kTempBufferAllocFailure; }
 }
 
+
+// =================================================================================================
+
+// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels.
+template <typename T>
+StatusCode Xgemm<T>::GemmDirect(const size_t m, const size_t n, const size_t k,
+                                const T alpha,
+                                const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                                const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                                const T beta,
+                                const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                                const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                                const bool a_conjugate, const bool b_conjugate) {
+
+  // Loads the program from the database
+  const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_);
+
+  // Retrieves the proper XgemmDirect kernel from the compiled binary
+  try {
+    const auto name = (a_do_transpose) ? (b_do_transpose ? "XgemmDirectTT" : "XgemmDirectTN") :
+                                         (b_do_transpose ? "XgemmDirectNT" : "XgemmDirectNN");
+    auto kernel = Kernel(program, name);
+
+    // Sets the kernel arguments
+    kernel.SetArgument(0, static_cast<int>(m));
+    kernel.SetArgument(1, static_cast<int>(n));
+    kernel.SetArgument(2, static_cast<int>(k));
+    kernel.SetArgument(3, GetRealArg(alpha));
+    kernel.SetArgument(4, GetRealArg(beta));
+    kernel.SetArgument(5, a_buffer());
+    kernel.SetArgument(6, static_cast<int>(a_offset));
+    kernel.SetArgument(7, static_cast<int>(a_ld));
+    kernel.SetArgument(8, b_buffer());
+    kernel.SetArgument(9, static_cast<int>(b_offset));
+    kernel.SetArgument(10, static_cast<int>(b_ld));
+    kernel.SetArgument(11, c_buffer());
+    kernel.SetArgument(12, static_cast<int>(c_offset));
+    kernel.SetArgument(13, static_cast<int>(c_ld));
+    kernel.SetArgument(14, static_cast<int>(c_do_transpose));
+    kernel.SetArgument(15, static_cast<int>(a_conjugate));
+    kernel.SetArgument(16, static_cast<int>(b_conjugate));
+
+    // Computes the global and local thread sizes
+    const auto m_ceiled = Ceil(m, db_["WGD"]);
+    const auto n_ceiled = Ceil(n, db_["WGD"]);
+    const auto global = std::vector<size_t>{
+      (m_ceiled * db_["MDIMCD"]) / db_["WGD"],
+      (n_ceiled * db_["NDIMCD"]) / db_["WGD"]
+    };
+    const auto local = std::vector<size_t>{db_["MDIMCD"], db_["NDIMCD"]};
+
+    // Launches the kernel
+    auto status = RunKernel(kernel, queue_, device_, global, local, event_);
+    if (ErrorIn(status)) { return status; }
+
+    // Successfully finished the computation
+    return StatusCode::kSuccess;
+  } catch (...) { return StatusCode::kInvalidKernel; }
+}
+
 // =================================================================================================
 
 // Compiles the templated class
diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp
index bc51c7f5..46e12453 100644
--- a/src/routines/level3/xgemm.hpp
+++ b/src/routines/level3/xgemm.hpp
@@ -35,6 +35,29 @@ class Xgemm: public Routine {
                     const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
                     const T beta,
                     const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld);
+
+  // Indirect version of GEMM (with pre and post-processing kernels)
+  StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k,
+                          const T alpha,
+                          const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                          const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                          const T beta,
+                          const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                          const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                          const bool a_conjugate, const bool b_conjugate,
+                          const size_t a_one, const size_t a_two, const bool a_want_rotated,
+                          const size_t b_one, const size_t b_two, const bool b_want_rotated,
+                          const size_t c_one, const size_t c_two, const bool c_want_rotated);
+
+  // Direct version of GEMM (no pre and post-processing kernels)
+  StatusCode GemmDirect(const size_t m, const size_t n, const size_t k,
+                        const T alpha,
+                        const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
+                        const T beta,
+                        const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld,
+                        const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose,
+                        const bool a_conjugate, const bool b_conjugate);
 };
 
 // =================================================================================================
diff --git a/src/tuning/kernels/copy_fast.cpp b/src/tuning/kernels/copy_fast.cpp
index 78ded56e..c57aab39 100644
--- a/src/tuning/kernels/copy_fast.cpp
+++ b/src/tuning/kernels/copy_fast.cpp
@@ -47,6 +47,7 @@ class TuneCopy {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/copy_pad.cpp b/src/tuning/kernels/copy_pad.cpp
index 90f5ea82..9486ee8d 100644
--- a/src/tuning/kernels/copy_pad.cpp
+++ b/src/tuning/kernels/copy_pad.cpp
@@ -47,6 +47,7 @@ class TunePad {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_fast.cpp b/src/tuning/kernels/transpose_fast.cpp
index 10fa80cb..2d9d5e49 100644
--- a/src/tuning/kernels/transpose_fast.cpp
+++ b/src/tuning/kernels/transpose_fast.cpp
@@ -47,6 +47,7 @@ class TuneTranspose {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/transpose_pad.cpp b/src/tuning/kernels/transpose_pad.cpp
index 507718eb..d364dabe 100644
--- a/src/tuning/kernels/transpose_pad.cpp
+++ b/src/tuning/kernels/transpose_pad.cpp
@@ -47,6 +47,7 @@ class TunePadTranspose {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
diff --git a/src/tuning/kernels/xaxpy.cpp b/src/tuning/kernels/xaxpy.cpp
index 0033b3c6..403ee9e4 100644
--- a/src/tuning/kernels/xaxpy.cpp
+++ b/src/tuning/kernels/xaxpy.cpp
@@ -51,6 +51,7 @@ class TuneXaxpy {
   static size_t DefaultN() { return 4096*1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xdot.cpp b/src/tuning/kernels/xdot.cpp
index 1581e13f..f8416761 100644
--- a/src/tuning/kernels/xdot.cpp
+++ b/src/tuning/kernels/xdot.cpp
@@ -47,6 +47,7 @@ class TuneXdot {
   static size_t DefaultN() { return 2*1024*1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 4cb7fd00..0eb1875b 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -52,6 +52,7 @@ class TuneXgemm {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1024; }
   static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
@@ -126,10 +127,10 @@ class TuneXgemm {
   // Sets the local memory size
   static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
     auto LocalMemorySize = [args] (std::vector<size_t> v) {
-      return (((v[0]*v[1]*v[2]/v[3]) + (v[4]*v[5]*v[6]/v[7]))*GetBytes(args.precision));
+      return (((v[0]*v[1]*v[2]) + (v[3]*v[4]*v[5]))*GetBytes(args.precision));
     };
-    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG", "VWM",
-                                                    "SB", "KWG", "NWG", "VWN"});
+    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"SA", "KWG", "MWG",
+                                                    "SB", "KWG", "NWG"});
   }
 
   // Sets the base thread configuration
diff --git a/src/tuning/kernels/xgemm_direct.cpp b/src/tuning/kernels/xgemm_direct.cpp
new file mode 100644
index 00000000..204e0be4
--- /dev/null
+++ b/src/tuning/kernels/xgemm_direct.cpp
@@ -0,0 +1,196 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the CLTune auto-tuner to tune the direct xgemm kernels. There are two variations:
+// - V==1: This tests some limited set of tuning parameters exhaustively.
+// - V==2: This tests a much larger set of tuning parameters by randomly sampling a subset.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T, int V>
+class TuneXgemmDirect {
+ public:
+
+  // The representative kernel and the source code
+  static std::string KernelFamily() { return (V==1) ? "xgemm_direct_1" : "xgemm_direct_2"; }
+  static std::string KernelName() { return "XgemmDirectTN"; }
+  static std::string GetSources() {
+    return
+      #include "../src/kernels/common.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part1.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part2.opencl"
+      #include "../src/kernels/level3/xgemm_direct_part3.opencl"
+    ;
+  }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction};
+  }
+
+  // Tests for valid arguments
+  static void TestValidArguments(const Arguments<T> &) { }
+
+  // Sets the default values for the arguments
+  static size_t DefaultM() { return 256; }
+  static size_t DefaultN() { return 256; }
+  static size_t DefaultK() { return 256; }
+  static double DefaultFraction() { return (V==1) ? 1.0 : 32.0; } // test all or sample randomly
+  static size_t DefaultNumRuns() { return 4; } // run every kernel this many times for averaging
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
+  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; }
+  static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; }
+  static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; }
+  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
+
+  // Sets the tuning parameters and their possible values
+  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+    if (V==1) { // limited subset of tuning parameters - but explorable exhaustively
+      tuner.AddParameter(id, "WGD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+      tuner.AddParameter(id, "KWID", {2});
+      tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+      tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+      tuner.AddParameter(id, "PADA", {1});
+      tuner.AddParameter(id, "PADB", {1});
+    } // a lot more tuning parameters - has to be sampled randomly, too much to test all
+    else {
+      tuner.AddParameter(id, "WGD", {8, 16, 32, 64, 128});
+      tuner.AddParameter(id, "MDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMCD", {8, 16, 32});
+      tuner.AddParameter(id, "MDIMAD", {8, 16, 32});
+      tuner.AddParameter(id, "NDIMBD", {8, 16, 32});
+      tuner.AddParameter(id, "KWID", {2, 8, 16});
+      tuner.AddParameter(id, "VWMD", {1, 2, 4, 8});
+      tuner.AddParameter(id, "VWND", {1, 2, 4, 8});
+      tuner.AddParameter(id, "PADA", {0, 1});
+      tuner.AddParameter(id, "PADB", {0, 1});
+    }
+  }
+
+  // Sets the constraints
+  static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
+    auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
+    auto MultipleOfXMulY = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]*v[2]); };
+    auto MultipleOfXMulYDivZ = [] (std::vector<size_t> v) { return IsMultiple(v[0], (v[1]*v[2])/v[3]); };
+    // Requirement for unrolling the WGD loop
+    tuner.AddConstraint(id, MultipleOfX, {"WGD", "KWID"});
+    // Required for integer MWID and NWID
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMCD", "VWMD"});
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMCD", "VWND"});
+    // Required for integer MWIAD and NWIBD
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "MDIMAD", "VWMD"});
+    tuner.AddConstraint(id, MultipleOfXMulY, {"WGD", "NDIMBD", "VWND"});
+    // WGD has to be a multiple of KDIMAD = ((MDIMCD*NDIMCD)/(MDIMAD)) and KDIMBD = (...)
+    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "MDIMAD"});
+    tuner.AddConstraint(id, MultipleOfXMulYDivZ, {"WGD", "MDIMCD", "NDIMCD", "NDIMBD"});
+
+    // Extra constraints for variation 1 to limit the set of options significantly
+    if (V==1) {
+      auto IsEqual = [] (std::vector<size_t> v) { return v[0] == v[1]; };
+      tuner.AddConstraint(id, IsEqual, {"MDIMCD", "MDIMAD"});
+      tuner.AddConstraint(id, IsEqual, {"NDIMCD", "NDIMBD"});
+    }
+  }
+
+  // Sets the local memory size
+  static void SetLocalMemorySize(cltune::Tuner &tuner, const size_t id, const Arguments<T> &args) {
+    auto LocalMemorySize = [args] (std::vector<size_t> v) {
+      return ((v[0]*(v[0] + v[1]) + v[0]*(v[0] + v[2]))*GetBytes(args.precision));
+    };
+    tuner.SetLocalMemoryUsage(id, LocalMemorySize, {"WGD", "PADA", "PADB"});
+  }
+
+  // Sets the base thread configuration
+  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
+  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
+  static std::vector<size_t> LocalSize() { return {1, 1}; }
+  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
+
+  // Transforms the thread configuration based on the parameters
+  using TransformVector = std::vector<std::vector<std::string>>;
+  static TransformVector MulLocal() { return {{"MDIMCD", "NDIMCD"}}; }
+  static TransformVector DivLocal() { return {}; }
+  static TransformVector MulGlobal() { return {{"MDIMCD", "NDIMCD"}}; }
+  static TransformVector DivGlobal() { return {{"WGD", "WGD"}}; }
+
+  // Sets the kernel's arguments
+  static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
+                           std::vector<T> &, std::vector<T> &,
+                           std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
+                           std::vector<T> &) {
+    tuner.AddArgumentScalar(static_cast<int>(args.m));
+    tuner.AddArgumentScalar(static_cast<int>(args.n));
+    tuner.AddArgumentScalar(static_cast<int>(args.k));
+    tuner.AddArgumentScalar(GetRealArg(args.alpha));
+    tuner.AddArgumentScalar(GetRealArg(args.beta));
+    tuner.AddArgumentInput(a_mat);
+    tuner.AddArgumentScalar(0); // a_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.k)); // a_ld
+    tuner.AddArgumentInput(b_mat);
+    tuner.AddArgumentScalar(0); // b_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.n)); // b_ld
+    tuner.AddArgumentOutput(c_mat);
+    tuner.AddArgumentScalar(0); // c_offset
+    tuner.AddArgumentScalar(static_cast<int>(args.n)); // c_ld
+    tuner.AddArgumentScalar(1); // c_do_transpose
+    tuner.AddArgumentScalar(0); // a_conjugate
+    tuner.AddArgumentScalar(0); // b_conjugate
+  }
+
+  // Describes how to compute the performance metrics
+  static size_t GetMetric(const Arguments<T> &args) {
+    return 2 * args.m * args.n * args.k;
+  }
+  static std::string PerformanceUnit() { return "GFLOPS"; }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Function to tune a specific variation V (not within the clblast namespace)
+template <int V>
+void StartVariation(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv)) {
+    case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemmDirect<half,V>, half>(argc, argv); break;
+    case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemmDirect<float,V>, float>(argc, argv); break;
+    case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemmDirect<double,V>, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemmDirect<float2,V>, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble: clblast::Tuner<clblast::TuneXgemmDirect<double2,V>, double2>(argc, argv); break;
+  }
+}
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  StartVariation<1>(argc, argv);
+  StartVariation<2>(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/src/tuning/kernels/xgemv.cpp b/src/tuning/kernels/xgemv.cpp
index 7229602d..f332f52a 100644
--- a/src/tuning/kernels/xgemv.cpp
+++ b/src/tuning/kernels/xgemv.cpp
@@ -50,6 +50,7 @@ class TuneXgemv {
   static size_t DefaultN() { return 2048; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.n; }
diff --git a/src/tuning/kernels/xger.cpp b/src/tuning/kernels/xger.cpp
index 1fb5c531..c3d0c7dd 100644
--- a/src/tuning/kernels/xger.cpp
+++ b/src/tuning/kernels/xger.cpp
@@ -47,6 +47,7 @@ class TuneXger {
   static size_t DefaultN() { return 1024; }
   static size_t DefaultK() { return 1; } // N/A for this kernel
   static double DefaultFraction() { return 1.0; } // N/A for this kernel
+  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
 
   // Describes how to obtain the sizes of the buffers
   static size_t GetSizeX(const Arguments<T> &args) { return args.m; }
diff --git a/src/tuning/tuning.hpp b/src/tuning/tuning.hpp
index 19df5f9a..afb092bc 100644
--- a/src/tuning/tuning.hpp
+++ b/src/tuning/tuning.hpp
@@ -30,6 +30,7 @@ namespace clblast {
 // that it is automatically compiled for the various kernels (given as the 'C' template argument).
 template <typename C, typename T>
 void Tuner(int argc, char* argv[]) {
+  constexpr auto kSeed = 42; // fixed seed for reproducibility
 
   // Sets the parameters and platform/device for which to tune (command-line options)
   auto help = std::string{"* Options given/available:\n"};
@@ -45,6 +46,8 @@ void Tuner(int argc, char* argv[]) {
     if (o == kArgBeta)     { args.beta     = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>()); }
     if (o == kArgFraction) { args.fraction = GetArgument(argc, argv, help, kArgFraction, C::DefaultFraction()); }
   }
+  const auto num_runs = GetArgument(argc, argv, help, kArgNumRuns, C::DefaultNumRuns());
+
   fprintf(stdout, "%s\n", help.c_str());
 
   // Tests validity of the given arguments
@@ -73,12 +76,12 @@ void Tuner(int argc, char* argv[]) {
   auto b_mat = std::vector<T>(C::GetSizeB(args));
   auto c_mat = std::vector<T>(C::GetSizeC(args));
   auto temp = std::vector<T>(C::GetSizeTemp(args));
-  PopulateVector(x_vec);
-  PopulateVector(y_vec);
-  PopulateVector(a_mat);
-  PopulateVector(b_mat);
-  PopulateVector(c_mat);
-  PopulateVector(temp);
+  PopulateVector(x_vec, kSeed);
+  PopulateVector(y_vec, kSeed);
+  PopulateVector(a_mat, kSeed);
+  PopulateVector(b_mat, kSeed);
+  PopulateVector(c_mat, kSeed);
+  PopulateVector(temp, kSeed);
 
   // Initializes the tuner for the chosen device
   cltune::Tuner tuner(args.platform_id, args.device_id);
@@ -126,6 +129,7 @@ void Tuner(int argc, char* argv[]) {
   C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);
 
   // Starts the tuning process
+  tuner.SetNumRuns(num_runs);
   tuner.Tune();
 
   // Prints the results to screen
@@ -134,7 +138,7 @@ void Tuner(int argc, char* argv[]) {
 
   // Also prints the performance of the best-case in terms of GB/s or GFLOPS
   if (time_ms != 0.0) {
-    printf("[ -------> ] %.1lf ms", time_ms);
+    printf("[ -------> ] %.2lf ms", time_ms);
     printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
   }
 
diff --git a/src/utilities.cpp b/src/utilities.cpp
index 77bc72d7..86cc2d13 100644
--- a/src/utilities.cpp
+++ b/src/utilities.cpp
@@ -270,40 +270,40 @@ unsigned int GetRandomSeed() {
 
 // Create a random number generator and populates a vector with samples from a random distribution
 template <typename T>
-void PopulateVector(std::vector<T> &vector) {
+void PopulateVector(std::vector<T> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<T>(kTestDataLowerLimit);
   auto upper_limit = static_cast<T>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<T> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element = dist(mt); }
 }
-template void PopulateVector<float>(std::vector<float>&);
-template void PopulateVector<double>(std::vector<double>&);
+template void PopulateVector<float>(std::vector<float>&, const unsigned int);
+template void PopulateVector<double>(std::vector<double>&, const unsigned int);
 
 // Specialized versions of the above for complex data-types
 template <>
-void PopulateVector(std::vector<float2> &vector) {
+void PopulateVector(std::vector<float2> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<float>(kTestDataLowerLimit);
   auto upper_limit = static_cast<float>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
 }
 template <>
-void PopulateVector(std::vector<double2> &vector) {
+void PopulateVector(std::vector<double2> &vector, const unsigned int seed) {
   auto lower_limit = static_cast<double>(kTestDataLowerLimit);
   auto upper_limit = static_cast<double>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<double> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element.real(dist(mt)); element.imag(dist(mt)); }
 }
 
 // Specialized versions of the above for half-precision
 template <>
-void PopulateVector(std::vector<half> &vector) {
+void PopulateVector(std::vector<half> &vector, const unsigned int seed) {
   const auto lower_limit = static_cast<float>(kTestDataLowerLimit);
   const auto upper_limit = static_cast<float>(kTestDataUpperLimit);
-  std::mt19937 mt(GetRandomSeed());
+  std::mt19937 mt(seed);
   std::uniform_real_distribution<float> dist(lower_limit, upper_limit);
   for (auto &element: vector) { element = FloatToHalf(dist(mt)); }
 }
diff --git a/src/utilities.hpp b/src/utilities.hpp
index 75bd5a69..038a8a96 100644
--- a/src/utilities.hpp
+++ b/src/utilities.hpp
@@ -25,6 +25,8 @@
 #include "clblast_half.h"
 #include "clpp11.hpp"
 
+#include "msvc.hpp"
+
 namespace clblast {
 // =================================================================================================
 
@@ -206,7 +208,7 @@ bool CheckArgument(const int argc, char *argv[], std::string &help, const std::s
 // =================================================================================================
 
 // Helper function to check for errors in the status code
-constexpr bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
+inline bool ErrorIn(const StatusCode s) { return (s != StatusCode::kSuccess); }
 
 // =================================================================================================
 
@@ -219,7 +221,7 @@ constexpr auto kTestDataUpperLimit = 2.0;
 
 // Populates a vector with random data
 template <typename T>
-void PopulateVector(std::vector<T> &vector);
+void PopulateVector(std::vector<T> &vector, const unsigned int seed);
 
 // =================================================================================================