Merge tag '1.5.2' into debian/sid

author: Gard Spreemann <gspr@nonempty.org> 2021-01-20 15:42:05 +0100
committer: Gard Spreemann <gspr@nonempty.org> 2021-01-20 15:42:05 +0100
commit: c8e17e202f2ac8ab338a1a444b6be37a97e38226 (patch)
tree: 9ecce4c2b90103725817782cc4412835ec95fac0
parent: b991afa34ae943548146e4f71bbbba8f1f4b1000 (diff)
parent: 70016e869881df837402def4904b2888247e02d9 (diff)
90 files changed, 1007 insertions, 71 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
index 8ff37320..0fc3ca55 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -57,8 +57,8 @@ build_script:
 
 after_build:
   - ps: pushd $env:CLBLAST_BUILD
-  - 7z a CLBlast-1.5.1-Windows-x64.zip .\install_dir\*
-  - ps: mv CLBlast-1.5.1-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
+  - 7z a CLBlast-1.5.2-Windows-x64.zip .\install_dir\*
+  - ps: mv CLBlast-1.5.2-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
 
 artifacts:
   - path: '*.zip'
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 00000000..be34db57
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1 @@
+github: CNugteren
diff --git a/.travis.yml b/.travis.yml
index e6d7fd16..f937222d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,7 @@ matrix:
 
 env:
   global:
-    - CLBLAST_VERSION=1.5.1
+    - CLBLAST_VERSION=1.5.2
     - CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
     - CLBLAST_INSTALL=${TRAVIS_BUILD_DIR}/bin/CLBlast-${CLBLAST_VERSION}
     - CLBLAST_TAR=CLBlast-${CLBLAST_VERSION}-${TRAVIS_OS_NAME}-x64.tar.gz
diff --git a/CHANGELOG b/CHANGELOG
index de932972..e3614d80 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,13 @@
+Version 1.5.2
+- Changed XAMAX/XAMIN to more likely return first rather than last min/max index, updated API docs
+- Added batched routines to pyclblast
+- Added CLBLAST_VERSION_MAJOR/MINOR/PATCH defines in headers to store version numbering
+- Several small improvements to the benchmark script (thanks to 'baryluk')
+- Fixed a bug in the caching when using a context with multiple devices
+- Fixed a bug in the tuners related to global workgroup size not being a multiple of the local
+- Various minor fixes and enhancements
+- Added tuned parameters for various devices (see doc/tuning.md)
+
 Version 1.5.1
 - Implemented single-kernel version of convolution as GEMM
 - Now catches all exceptions thrown by the tuners
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 09d93d88..2d280503 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ endif()
 project("clblast" C CXX)
 set(clblast_VERSION_MAJOR 1)
 set(clblast_VERSION_MINOR 5)
-set(clblast_VERSION_PATCH 1)
+set(clblast_VERSION_PATCH 2)
 set(clblast_VERSION "${clblast_VERSION_MAJOR}.${clblast_VERSION_MINOR}.${clblast_VERSION_PATCH}")
 set(clblast_SOVERSION ${clblast_VERSION_MAJOR})
 
@@ -228,7 +228,7 @@ set(PRECISIONS 32 64 3232 6464 16)
 # Sample programs
 if(OPENCL)
   set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm tuning_api)
-  set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
+  set(SAMPLE_PROGRAMS_C sasum samax dgemv sgemm haxpy cache)
   if(NETLIB)
     set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
   endif()
diff --git a/doc/api.md b/doc/api.md
index 996505f1..9d4bedfa 100644
--- a/doc/api.md
+++ b/doc/api.md
@@ -511,7 +511,7 @@ Arguments to SUM:
 xAMAX: Index of absolute maximum value in a vector
 -------------
 
-Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.
+Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.
 
 C++ API:
 ```
@@ -562,7 +562,7 @@ Arguments to AMAX:
 xAMIN: Index of absolute minimum value in a vector (non-BLAS function)
 -------------
 
-Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.
+Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.
 
 C++ API:
 ```
@@ -613,7 +613,7 @@ Arguments to AMIN:
 xMAX: Index of maximum value in a vector (non-BLAS function)
 -------------
 
-Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
+Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
 
 C++ API:
 ```
@@ -664,7 +664,7 @@ Arguments to MAX:
 xMIN: Index of minimum value in a vector (non-BLAS function)
 -------------
 
-Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
+Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
 
 C++ API:
 ```
diff --git a/doc/tuning.md b/doc/tuning.md
index 20af9bd5..793bce88 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -30,6 +30,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
   - GeForce GTX TITAN Black
   - GeForce GTX TITAN X
   - TITAN X (Pascal)
+  - TITAN RTX
   - Tesla K20m
   - Tesla K40m
   - Tesla P100 16GB
@@ -44,6 +45,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
   - Radeon RX 480
   - Radeon R9 Fury X
   - Radeon Pro 580
+  - Radeon RX Vega
 * Intel GPUs:
   - HD Graphics 530
   - HD Graphics 5500 BroadWell U-Processor GT2
diff --git a/include/clblast.h b/include/clblast.h
index 7a82361c..296f3987 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -39,6 +39,11 @@
   #define PUBLIC_API
 #endif
 
+// Version numbering (v1.5.1)
+#define CLBLAST_VERSION_MAJOR 1
+#define CLBLAST_VERSION_MINOR 5
+#define CLBLAST_VERSION_PATCH 1
+
 namespace clblast {
 // =================================================================================================
 
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 2ba6375a..7f7b1e97 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -34,6 +34,11 @@
   #define PUBLIC_API
 #endif
 
+// Version numbering (v1.5.1)
+#define CLBLAST_VERSION_MAJOR 1
+#define CLBLAST_VERSION_MINOR 5
+#define CLBLAST_VERSION_PATCH 1
+
 // The C interface
 #ifdef __cplusplus
 extern "C" {
diff --git a/samples/samax.c b/samples/samax.c
new file mode 100644
index 00000000..36e78846
--- /dev/null
+++ b/samples/samax.c
@@ -0,0 +1,102 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the iSAMAX routine. It is pure C99 and demonstrates the use of
+// the C API to the CLBlast library.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define CL_TARGET_OPENCL_VERSION 110
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// =================================================================================================
+
+// Example use of the single-precision routine iSAMAX
+int main(void) {
+
+  // OpenCL platform/device settings
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example iSAMAX arguments
+  const size_t n = 1000;
+
+  // Initializes the OpenCL platform
+  cl_uint num_platforms;
+  clGetPlatformIDs(0, NULL, &num_platforms);
+  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_platform_id platform = platforms[platform_id];
+
+  // Initializes the OpenCL device
+  cl_uint num_devices;
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_device_id device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+  cl_event event = NULL;
+
+  // Populate host data structures with some example data
+  float* host_input = (float*)malloc(sizeof(float)*n);
+  unsigned int* host_output = (unsigned int*)malloc(sizeof(unsigned int)*1);
+  for (size_t i=0; i<n; ++i) { host_input[i] = (float)(i % 10); } // staircase modulo 10
+  for (size_t i=0; i<1; ++i) { host_output[i] = 77; }  // some temp value to be overwritten later
+
+  // Copy the data-structures to the device
+  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
+  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(unsigned int), NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(unsigned int), host_output, 0, NULL, NULL);
+
+  // Call the iSAMAX routine.
+  CLBlastStatusCode status = CLBlastiSamax(n,
+                                           device_output, 0,
+                                           device_input, 0, 1,
+                                           &queue, &event);
+
+  // Wait for completion
+  if (status == CLBlastSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+
+  // Copies the result back to the host
+  clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(unsigned int), host_output, 0, NULL, NULL);
+
+  // Example completed. See "clblast_c.h" for status codes (0 -> success).
+  printf("Completed iSAMAX with status %d: array of %d values with staircases from 0..9 repeated, max at index %zu with value %.0lf\n",
+         status, n, host_output[0], host_input[host_output[0]]);
+
+  // Clean-up
+  free(platforms);
+  free(devices);
+  free(host_input);
+  free(host_output);
+  clReleaseMemObject(device_input);
+  clReleaseMemObject(device_output);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/scripts/benchmark/benchmark.py b/scripts/benchmark/benchmark.py
index 0bb37c10..81f1ec90 100644
--- a/scripts/benchmark/benchmark.py
+++ b/scripts/benchmark/benchmark.py
@@ -63,7 +63,7 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device, c
             all_arguments = [arg if arg != "-precision 16" else "-precision 32" for arg in all_arguments]
             benchmark_output = utils.run_binary(binary, all_arguments)
             result_extra = utils.parse_results(benchmark_output)
-            for index in range(len(min(result, result_extra))):
+            for index in range(min(len(result), len(result_extra))):
                 result[index]["GBs_1_FP32"] = result_extra[index]["GBs_1"]
                 result[index]["GFLOPS_1_FP32"] = result_extra[index]["GFLOPS_1"]
                 for id in COMPARISON_IDS:
@@ -83,7 +83,7 @@ def parse_arguments(argv):
     parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
     parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging")
     parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464")
-    parser.add_argument("-l", "--load_from_disk", action="store_true", help="Increase verbosity of the script")
+    parser.add_argument("-l", "--load_from_disk", action="store_true", help="Loading existing results from JSON file and replot")
     parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name")
     parser.add_argument("-z", "--tight_plot", action="store_true", help="Enables tight plot layout for in paper or presentation")
     parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
@@ -103,7 +103,7 @@ def benchmark_single(benchmark, comparisons, platform, device, num_runs, precisi
     # The benchmark name and plot title
     benchmark_name = utils.precision_to_letter(precision) + benchmark.upper()
     if benchmark.upper() != "SUMMARY":
-        plot_title = benchmark_name if plot_title is "" else benchmark_name + ": " + plot_title
+        plot_title = benchmark_name if plot_title == "" else benchmark_name + ": " + plot_title
 
     # Retrieves the comparison settings
     library_ids = [1]
diff --git a/scripts/benchmark/plot.py b/scripts/benchmark/plot.py
index 6337b78f..b0b63df3 100644
--- a/scripts/benchmark/plot.py
+++ b/scripts/benchmark/plot.py
@@ -10,6 +10,7 @@ import matplotlib
 matplotlib.use('Agg')
 from matplotlib import rcParams
 import matplotlib.pyplot as plt
+import numpy as np
 
 # Colors
 BLUEISH = [c / 255.0 for c in [71, 101, 177]]  # #4765b1
@@ -24,7 +25,7 @@ def plot_graphs(results, file_name, num_rows, num_cols,
                 x_keys, y_keys, titles, x_labels, y_labels,
                 label_names, title, tight_plot, verbose):
     assert len(results) == num_rows * num_cols
-    assert len(results) != 1
+    assert len(results) >= 1
     assert len(x_keys) == len(results)
     assert len(y_keys) == len(results)
     assert len(titles) == len(results)
@@ -64,6 +65,9 @@ def plot_graphs(results, file_name, num_rows, num_cols,
     size_y = plot_size * num_rows
     rcParams.update({'font.size': font_size})
     fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(size_x, size_y), facecolor='w', edgecolor='k')
+    if len(results) == 1 and not type(axes) is np.ndarray:
+        axes = np.full((1,1), axes)
+    assert type(axes) is np.ndarray
     fig.text(.5, 0.92, title, horizontalalignment="center", fontsize=font_size_title)
     plt.subplots_adjust(wspace=w_space, hspace=h_space)
 
@@ -72,7 +76,7 @@ def plot_graphs(results, file_name, num_rows, num_cols,
         for col in range(num_cols):
             index = row * num_cols + col
             result = results[index]
-            ax = axes.flat[index]
+            ax = axes[row, col]
             plt.sca(ax)
             print("[plot] Plotting subplot %d" % index)
 
diff --git a/scripts/benchmark/utils.py b/scripts/benchmark/utils.py
index 11aad805..fe0bd7f7 100644
--- a/scripts/benchmark/utils.py
+++ b/scripts/benchmark/utils.py
@@ -50,7 +50,8 @@ def run_binary(command, arguments):
     full_command = command + " " + " ".join(arguments)
     print("[benchmark] Calling binary: %s" % str(full_command))
     try:
-        return subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE).stdout.read()
+        result = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE).stdout.read()
+        return result.decode("ascii")
     except OSError as e:
         print("[benchmark] Error while running the binary, got exception: %s" + str(e))
         return False
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 76c5dc1c..875b1a55 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -49,7 +49,7 @@ FILES = [
     "/src/clblast_cuda.cpp",
     "/src/pyclblast/src/pyclblast.pyx"
 ]
-HEADER_LINES = [124, 21, 128, 24, 29, 45, 29, 66, 40, 96, 21, 327]
+HEADER_LINES = [129, 21, 133, 24, 29, 45, 29, 66, 40, 96, 21, 327]
 FOOTER_LINES = [98, 57, 112, 275, 6, 6, 6, 9, 2, 41, 56, 37]
 HEADER_LINES_DOC = 0
 FOOTER_LINES_DOC = 232
@@ -132,10 +132,10 @@ ROUTINES = [
   Routine(True,  True,  0, False, "1", "nrm2",  T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["nrm2"],                     [xn,"1"],      [],           "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
   Routine(True,  True,  0, False, "1", "asum",  T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["asum"],                     [xn,"1"],      [],           "n",   "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
   Routine(True,  False, 0, False, "1", "sum",   T, [S,D,Sc,Dz,H],    ["n"],               [],                                                     ["x"],      ["sum"],                      [xn,"1"],      [],           "n",   "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
-  Routine(True,  True,  0, False, "1", "amax",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
-  Routine(True,  False, 0, False, "1", "amin",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []),
-  Routine(True,  False, 0, False, "1", "max",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
-  Routine(True,  False, 0, False, "1", "min",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
+  Routine(True,  True,  0, False, "1", "amax",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of absolute maximum value in a vector", "Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
+  Routine(True,  False, 0, False, "1", "amin",  T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []),
+  Routine(True,  False, 0, False, "1", "max",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imax"],                     [xn,"1"],      [],           "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
+  Routine(True,  False, 0, False, "1", "min",   T, [iS,iD,iC,iZ,iH], ["n"],               [],                                                     ["x"],      ["imin"],                     [xn,"1"],      [],           "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
 ],
 [  # Level 2: matrix-vector
   Routine(True,  True,  0, False, "2a", "gemv",  T,  [S,D,C,Z,H],    ["m","n"],           ["layout","a_transpose"],                               ["a","x"],  ["y"],                        [amn,xmn,ynm], ["alpha","beta"], "",    "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
diff --git a/scripts/generator/generator/pyclblast.py b/scripts/generator/generator/pyclblast.py
index 47eb2eb4..b7ec348e 100644
--- a/scripts/generator/generator/pyclblast.py
+++ b/scripts/generator/generator/pyclblast.py
@@ -22,6 +22,16 @@ def to_np_dtype(flavour):
     }[flavour.precision_name]
 
 
+def cl_type(flavour):
+    return {
+        "S": "cl_float",
+        "D": "cl_double",
+        "C": "cl_float2",
+        "Z": "cl_double2",
+        "H": "cl_half",
+    }[flavour.precision_name]
+
+
 def scalar_cython_conversion(scalar, flavour):
     scalar_type = flavour.alpha_cl if scalar == "alpha" else flavour.beta_cl
     if scalar_type == "float":
@@ -39,7 +49,9 @@ def scalar_cython_conversion(scalar, flavour):
 
 def generate_pyx(routine):
     result = ""
-    if routine.implemented and routine.plain_name() and routine.level in ["1", "2a", "2b", "3"]:
+    if routine.implemented and routine.plain_name() and routine.level in ["1", "2a", "2b", "3", "x"]:
+        if routine.level == "x" and routine.batched == 0:
+            return result # level-X routines that are non-batched are not supported at the moment
         indent = "    "
 
         result += SEPARATOR + NL
@@ -80,6 +92,33 @@ def generate_pyx(routine):
             result += buf + ", \"" + buf + "\")" + NL
         result += NL
 
+        # Batched checks
+        if routine.batched == 1:  # batched but not strided-batched
+            lists = [b + "_offsets" for b in buffers] + [s + "s" for s in routine.scalars]
+            result += indent + "if " + " != ".join(["len(" + l + ")" for l in lists]) + ":" + NL
+            result += indent + indent + "raise RuntimeError(\"PyCLBlast: 'CLBlastX" + routine.plain_name() + "' failed: length of batch-sized arguments " + ", ".join(lists) + " should be equal\")" + NL
+            result += indent + "batch_count = len(" + lists[0] + ")" + NL
+            result += NL
+
+            # Batched list to pointer conversions
+            for buf in buffers:
+                result += indent + "cdef size_t *" + buf + "_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))" + NL
+                result += indent + "for i in range(batch_count):" + NL
+                result += indent + indent + "" + buf + "_offsets_c[i] = " + buf + "_offsets[i]" + NL
+            for scalar in routine.scalars:
+                result += indent + "cdef void *" + scalar + "s_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))" + NL
+                result += indent + "for i in range(batch_count):" + NL
+                if_prefix = ""
+                for flavour in routine.flavours:
+                    if flavour.precision_name in ["S", "D", "C", "Z", "H"]:
+                        np_dtype = to_np_dtype(flavour)
+                        result += indent + indent + if_prefix + "if dtype == np.dtype(\"" + np_dtype + "\"):" + NL
+                        scalar_converted = scalar_cython_conversion(scalar + "s[i]", flavour)
+                        result += indent + indent + indent + "(<" + cl_type(flavour) + "*>" + scalar + "s_c)[i] = " + scalar_converted + NL
+                        if_prefix = "el"
+
+            result += NL
+
         # Buffer transformation
         for buf in buffers:
             result += indent + "cdef cl_mem " + buf + "_buffer = <cl_mem><size_t>" + buf + ".base_data.int_ptr" + NL
@@ -108,11 +147,22 @@ def generate_pyx(routine):
         for flavour in routine.flavours:
             if flavour.precision_name in ["S", "D", "C", "Z", "H"]:
                 np_dtype = to_np_dtype(flavour)
-                argument_names = [x.
-                                  replace("layout", "CLBlastLayoutRowMajor").
-                                  replace("alpha", scalar_cython_conversion("alpha", flavour)).
-                                  replace("beta", scalar_cython_conversion("beta", flavour))
-                                  for x in routine.arguments()]
+                if routine.batched != 1:  # regular or strided-batched
+                    argument_names = [x.
+                                      replace("layout", "CLBlastLayoutRowMajor").
+                                      replace("alpha", scalar_cython_conversion("alpha", flavour)).
+                                      replace("beta", scalar_cython_conversion("beta", flavour))
+                                      for x in routine.arguments()]
+                else:  # batched but not strided-batched
+                    argument_names = [x.
+                                      replace("layout", "CLBlastLayoutRowMajor").
+                                      replace("_cpp", "_c").
+                                      replace("_offsets", "_offsets_c").
+                                      replace("alphas_c", "<" + cl_type(flavour) + "*>alphas_c").
+                                      replace("betas_c", "<" + cl_type(flavour) + "*>betas_c")
+                                      for x in routine.arguments()]
+                if routine.batched > 0:
+                    argument_names.append("batch_count")
                 result += indent + if_prefix + "if dtype == np.dtype(\"" + np_dtype + "\"):" + NL
                 result += indent + indent + "err = CLBlast" + flavour.name + routine.plain_name()
                 result += "(" + ", ".join(argument_names) + ", &command_queue, &event)" + NL
@@ -120,6 +170,14 @@ def generate_pyx(routine):
 
         result += indent + "else:" + NL
         result += indent + indent + "raise ValueError(\"PyCLBlast: Unrecognized data-type '%s'\" % dtype)" + NL
+        result += NL
+
+        # Cleaning up
+        if routine.batched == 1:  # batched but not strided-batched
+            for array in [b + "_offset" for b in buffers] + routine.scalars:
+                result += indent + "PyMem_Free(" + array + "s_c)" + NL
+            result += NL
+
         result += indent + "if err != CLBlastSuccess:" + NL
         result += indent + indent + "raise RuntimeError(\"PyCLBlast: 'CLBlastX" + routine.plain_name() + "' failed: %s\" % get_status_message(err))" + NL
         result += indent + "return cl.Event.from_int_ptr(<size_t>event)" + NL
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 3b5a6b76..8b6ab57f 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -825,17 +825,37 @@ class Routine:
         """Arguments for the Python wrapper pyclblast"""
         result = list()
         result.extend(self.sizes)
+        if self.batched == 2:  # strided batched
+            result.append("batch_count")
         buffers = self.inputs + self.outputs
         result.extend(buffers[:])
-        for buf in buffers:
-            if buf in self.buffers_matrix():
-                result.append(buf + "_ld")
-        for buf in buffers:
-            if buf in self.buffers_vector():
-                result.append(buf + "_inc = 1")
-        for scalar in self.scalars:
-            default = "1.0" if scalar == "alpha" else "0.0"
-            result.append(scalar + " = " + default)
+        if self.batched != 1:  # regular or strided-batched
+            for buf in buffers:
+                if buf in self.buffers_matrix():
+                    result.append(buf + "_ld")
+            for buf in buffers:
+                if buf in self.buffers_vector():
+                    result.append(buf + "_inc = 1")
+            if self.batched == 2:  # strided batched
+                for buf in buffers:
+                    if buf in self.buffers_matrix():
+                        result.append(buf + "_stride")
+            for scalar in self.scalars:
+                if scalar != "":
+                    default = "1.0" if scalar == "alpha" else "0.0"
+                    result.append(scalar + " = " + default)
+        else:  # batched but not strided-batched
+            for scalar in self.scalars:
+                result.append(scalar + "s")
+            for buf in buffers:
+                if buf in self.buffers_matrix():
+                    result.append(buf + "_ld")
+            for buf in buffers:
+                if buf in self.buffers_vector() + self.buffers_matrix():
+                    result.append(buf + "_offsets")
+            for buf in buffers:
+                if buf in self.buffers_vector():
+                    result.append(buf + "_inc = 1")
         for option in self.options:
             if option == "a_transpose":
                 result.append("a_transp = False")
@@ -849,8 +869,9 @@ class Routine:
                 result.append("lower_triangle = False")
             if option == "diagonal":
                 result.append("unit_diagonal = False")
-        for buf in buffers:
-            result.append(buf + "_offset = 0")
+        if self.batched != 1:
+            for buf in buffers:
+                result.append(buf + "_offset = 0")
         return result
 
     def requirements_doc(self):
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index aeb14989..2a25606c 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -47,6 +47,7 @@
 #include <assert.h>
 
 // OpenCL
+#define CL_TARGET_OPENCL_VERSION 110
 #define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
 #define CL_USE_DEPRECATED_OPENCL_2_0_APIS // to disable deprecation warnings
@@ -508,12 +509,35 @@ class Program {
 
   // Retrieves a binary or an intermediate representation of the compiled program
   std::string GetIR() const {
-    auto bytes = size_t{0};
-    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
+    cl_uint num_devices = 0;
+    CheckError(clGetProgramInfo(program_, CL_PROGRAM_NUM_DEVICES,
+                sizeof(cl_uint), &num_devices, nullptr));
+
+    std::vector<size_t> binSizesInBytes(num_devices, 0);
+    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+                num_devices * sizeof(size_t), binSizesInBytes.data(), nullptr));
+
+    auto bytes       = size_t{0};
+    auto binSizeIter = size_t{0};
+    // Loop over the program binary sizes to find a binary whose size is > 0.
+    // The current logic assumes that there ever is only one valid program binary
+    // in a given cl_program. This should be the case unless the cl_program
+    // is built for all or a subset of devices associated to a given cl_program
+    for (; binSizeIter < binSizesInBytes.size(); ++binSizeIter) {
+        if (binSizesInBytes[binSizeIter] > 0) {
+            bytes = binSizesInBytes[binSizeIter];
+            break;
+        }
+    }
     auto result = std::string{};
     result.resize(bytes);
-    auto result_ptr = result.data();
-    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
+
+    std::vector<char*> out(num_devices, nullptr);
+    out[binSizeIter] = const_cast<char*>(result.data());
+
+    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
+                                num_devices * sizeof(char*),
+                                out.data(), nullptr));
     return result;
   }
 
diff --git a/src/database/database_structure.hpp b/src/database/database_structure.hpp
index 0199ec3a..ec6a16d2 100644
--- a/src/database/database_structure.hpp
+++ b/src/database/database_structure.hpp
@@ -21,6 +21,7 @@
 
 // Just needed for 'Precision'
 #ifdef OPENCL_API
+  #define CL_TARGET_OPENCL_VERSION 110
   #include "clblast.h"
 #elif CUDA_API
   #include "clblast_cuda.h"
diff --git a/src/database/kernels/copy/copy_16.hpp b/src/database/kernels/copy/copy_16.hpp
index 564bea6b..a13072f0 100644
--- a/src/database/kernels/copy/copy_16.hpp
+++ b/src/database/kernels/copy/copy_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry CopyHalf = {
           { Name{"AMD Radeon RX 480                                 "}, Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/copy/copy_32.hpp b/src/database/kernels/copy/copy_32.hpp
index 3ef38ccb..9f24afea 100644
--- a/src/database/kernels/copy/copy_32.hpp
+++ b/src/database/kernels/copy/copy_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry CopySingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -87,6 +91,7 @@ const DatabaseEntry CopySingle = {
     { // Intel GPUs
       kDeviceTypeGPU, "Intel", {
         { "default", {
+          { Name{"Intel(R) Gen9 HD Graphics NEO                     "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { Name{"Intel(R) HD Graphics 530                          "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -154,6 +159,10 @@ const DatabaseEntry CopySingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/copy/copy_3232.hpp b/src/database/kernels/copy/copy_3232.hpp
index 81feba9f..08ee3c25 100644
--- a/src/database/kernels/copy/copy_3232.hpp
+++ b/src/database/kernels/copy/copy_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry CopyComplexSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -149,6 +153,10 @@ const DatabaseEntry CopyComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/copy/copy_64.hpp b/src/database/kernels/copy/copy_64.hpp
index 52ce191e..f6ef641e 100644
--- a/src/database/kernels/copy/copy_64.hpp
+++ b/src/database/kernels/copy/copy_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry CopyDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,8 +140,12 @@ const DatabaseEntry CopyDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/copy/copy_6464.hpp b/src/database/kernels/copy/copy_6464.hpp
index d0e31201..8607efc5 100644
--- a/src/database/kernels/copy/copy_6464.hpp
+++ b/src/database/kernels/copy/copy_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry CopyComplexDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry CopyComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
index 97b9c5ed..68e18f01 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_32.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
@@ -61,6 +61,10 @@ const DatabaseEntry GemmRoutineSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
index c3e60fe5..3f5e32bc 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
@@ -48,15 +48,19 @@ const DatabaseEntry GemmRoutineComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1856, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1856, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/gemm_routine/gemm_routine_64.hpp b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
index bd3df89a..036da90f 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_64.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
@@ -39,15 +39,19 @@ const DatabaseEntry GemmRoutineDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
     { // Default
       kDeviceTypeAll, "default", {
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
index 044fa5d8..facfd8c8 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
@@ -39,6 +39,10 @@ const DatabaseEntry GemmRoutineComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/invert/invert_32.hpp b/src/database/kernels/invert/invert_32.hpp
index aaaec3e5..aeb8a6d4 100644
--- a/src/database/kernels/invert/invert_32.hpp
+++ b/src/database/kernels/invert/invert_32.hpp
@@ -50,6 +50,10 @@ const DatabaseEntry InvertSingle = {
           { Name{"GeForce GTX 1070 Ti                               "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/invert/invert_3232.hpp b/src/database/kernels/invert/invert_3232.hpp
index 5666e9a2..575b104a 100644
--- a/src/database/kernels/invert/invert_3232.hpp
+++ b/src/database/kernels/invert/invert_3232.hpp
@@ -49,6 +49,10 @@ const DatabaseEntry InvertComplexSingle = {
           { Name{"GeForce GTX 1070 Ti                               "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/invert/invert_64.hpp b/src/database/kernels/invert/invert_64.hpp
index 2cd5e369..96a5223a 100644
--- a/src/database/kernels/invert/invert_64.hpp
+++ b/src/database/kernels/invert/invert_64.hpp
@@ -40,6 +40,10 @@ const DatabaseEntry InvertDouble = {
           { Name{"GeForce GTX 1070 Ti                               "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/invert/invert_6464.hpp b/src/database/kernels/invert/invert_6464.hpp
index 4e518e81..ebacda6c 100644
--- a/src/database/kernels/invert/invert_6464.hpp
+++ b/src/database/kernels/invert/invert_6464.hpp
@@ -40,6 +40,10 @@ const DatabaseEntry InvertComplexDouble = {
           { Name{"GeForce GTX 1070 Ti                               "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/pad/pad_16.hpp b/src/database/kernels/pad/pad_16.hpp
index ca5acf75..17367a44 100644
--- a/src/database/kernels/pad/pad_16.hpp
+++ b/src/database/kernels/pad/pad_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry PadHalf = {
           { Name{"AMD Radeon RX 480                                 "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/pad/pad_32.hpp b/src/database/kernels/pad/pad_32.hpp
index 7e554e31..52529d1e 100644
--- a/src/database/kernels/pad/pad_32.hpp
+++ b/src/database/kernels/pad/pad_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry PadSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -154,6 +158,10 @@ const DatabaseEntry PadSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/pad/pad_3232.hpp b/src/database/kernels/pad/pad_3232.hpp
index cea79c14..cfe5c632 100644
--- a/src/database/kernels/pad/pad_3232.hpp
+++ b/src/database/kernels/pad/pad_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry PadComplexSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -151,6 +155,10 @@ const DatabaseEntry PadComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/pad/pad_64.hpp b/src/database/kernels/pad/pad_64.hpp
index e85dcb17..b398eca9 100644
--- a/src/database/kernels/pad/pad_64.hpp
+++ b/src/database/kernels/pad/pad_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry PadDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,8 +140,12 @@ const DatabaseEntry PadDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/pad/pad_6464.hpp b/src/database/kernels/pad/pad_6464.hpp
index 52a4d0e0..b3898574 100644
--- a/src/database/kernels/pad/pad_6464.hpp
+++ b/src/database/kernels/pad/pad_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry PadComplexDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry PadComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/padtranspose/padtranspose_16.hpp b/src/database/kernels/padtranspose/padtranspose_16.hpp
index 4a4f619c..13979178 100644
--- a/src/database/kernels/padtranspose/padtranspose_16.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry PadtransposeHalf = {
           { Name{"AMD Radeon RX 480                                 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/padtranspose/padtranspose_32.hpp b/src/database/kernels/padtranspose/padtranspose_32.hpp
index f7adc03c..fb09bc16 100644
--- a/src/database/kernels/padtranspose/padtranspose_32.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry PadtransposeSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -153,6 +157,10 @@ const DatabaseEntry PadtransposeSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/padtranspose/padtranspose_3232.hpp b/src/database/kernels/padtranspose/padtranspose_3232.hpp
index e37e385a..5aaa946d 100644
--- a/src/database/kernels/padtranspose/padtranspose_3232.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry PadtransposeComplexSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -151,6 +155,10 @@ const DatabaseEntry PadtransposeComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/padtranspose/padtranspose_64.hpp b/src/database/kernels/padtranspose/padtranspose_64.hpp
index 77328adb..6ac155d9 100644
--- a/src/database/kernels/padtranspose/padtranspose_64.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry PadtransposeDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry PadtransposeDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/padtranspose/padtranspose_6464.hpp b/src/database/kernels/padtranspose/padtranspose_6464.hpp
index 8330bffb..8ad7337f 100644
--- a/src/database/kernels/padtranspose/padtranspose_6464.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry PadtransposeComplexDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry PadtransposeComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/transpose/transpose_16.hpp b/src/database/kernels/transpose/transpose_16.hpp
index a83e81cc..be06265c 100644
--- a/src/database/kernels/transpose/transpose_16.hpp
+++ b/src/database/kernels/transpose/transpose_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry TransposeHalf = {
           { Name{"AMD Radeon RX 480                                 "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/transpose/transpose_32.hpp b/src/database/kernels/transpose/transpose_32.hpp
index 7e9dec8c..5aca3154 100644
--- a/src/database/kernels/transpose/transpose_32.hpp
+++ b/src/database/kernels/transpose/transpose_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry TransposeSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -153,6 +157,10 @@ const DatabaseEntry TransposeSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/transpose/transpose_3232.hpp b/src/database/kernels/transpose/transpose_3232.hpp
index f4990954..b7341b88 100644
--- a/src/database/kernels/transpose/transpose_3232.hpp
+++ b/src/database/kernels/transpose/transpose_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry TransposeComplexSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -143,6 +147,10 @@ const DatabaseEntry TransposeComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/transpose/transpose_64.hpp b/src/database/kernels/transpose/transpose_64.hpp
index 9f205565..3e73f07b 100644
--- a/src/database/kernels/transpose/transpose_64.hpp
+++ b/src/database/kernels/transpose/transpose_64.hpp
@@ -43,9 +43,13 @@ const DatabaseEntry TransposeDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-          { kDeviceNameDefault                                        , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
@@ -136,6 +140,10 @@ const DatabaseEntry TransposeDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/transpose/transpose_6464.hpp b/src/database/kernels/transpose/transpose_6464.hpp
index 2029c4fb..d16d9534 100644
--- a/src/database/kernels/transpose/transpose_6464.hpp
+++ b/src/database/kernels/transpose/transpose_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry TransposeComplexDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -127,6 +131,10 @@ const DatabaseEntry TransposeComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_32.hpp b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
index 7d9b9372..e6184116 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_32.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
@@ -42,6 +42,10 @@ const DatabaseEntry TrsvRoutineSingle = {
           { Name{"GeForce GTX 1070 Ti                               "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
index 6ec322fa..e479f235 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
@@ -41,6 +41,10 @@ const DatabaseEntry TrsvRoutineComplexSingle = {
           { Name{"GeForce GTX 1070 Ti                               "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_64.hpp b/src/database/kernels/trsv_routine/trsv_routine_64.hpp
index db65f847..d5c66625 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_64.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_64.hpp
@@ -32,6 +32,10 @@ const DatabaseEntry TrsvRoutineDouble = {
           { Name{"GeForce GTX 1070 Ti                               "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_6464.hpp b/src/database/kernels/trsv_routine/trsv_routine_6464.hpp
index 4a98b0a7..db52f414 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_6464.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_6464.hpp
@@ -32,6 +32,10 @@ const DatabaseEntry TrsvRoutineComplexDouble = {
           { Name{"GeForce GTX 1070 Ti                               "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xaxpy/xaxpy_16.hpp b/src/database/kernels/xaxpy/xaxpy_16.hpp
index ac243f05..a7548c9f 100644
--- a/src/database/kernels/xaxpy/xaxpy_16.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry XaxpyHalf = {
           { Name{"AMD Radeon RX 480                                 "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xaxpy/xaxpy_32.hpp b/src/database/kernels/xaxpy/xaxpy_32.hpp
index 92ea904d..c2eba834 100644
--- a/src/database/kernels/xaxpy/xaxpy_32.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry XaxpySingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -154,8 +158,12 @@ const DatabaseEntry XaxpySingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/xaxpy/xaxpy_3232.hpp b/src/database/kernels/xaxpy/xaxpy_3232.hpp
index a1c69b83..8f686b3e 100644
--- a/src/database/kernels/xaxpy/xaxpy_3232.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_3232.hpp
@@ -51,9 +51,13 @@ const DatabaseEntry XaxpyComplexSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
-          { kDeviceNameDefault                                        , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
@@ -151,6 +155,10 @@ const DatabaseEntry XaxpyComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xaxpy/xaxpy_64.hpp b/src/database/kernels/xaxpy/xaxpy_64.hpp
index 5cc36a65..4494cf05 100644
--- a/src/database/kernels/xaxpy/xaxpy_64.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry XaxpyDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry XaxpyDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xaxpy/xaxpy_6464.hpp b/src/database/kernels/xaxpy/xaxpy_6464.hpp
index 07bac80b..aae465cf 100644
--- a/src/database/kernels/xaxpy/xaxpy_6464.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry XaxpyComplexDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry XaxpyComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xdot/xdot_16.hpp b/src/database/kernels/xdot/xdot_16.hpp
index 4d1b96d3..f78c1606 100644
--- a/src/database/kernels/xdot/xdot_16.hpp
+++ b/src/database/kernels/xdot/xdot_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry XdotHalf = {
           { Name{"AMD Radeon RX 480                                 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xdot/xdot_32.hpp b/src/database/kernels/xdot/xdot_32.hpp
index 560e51ca..47c57f12 100644
--- a/src/database/kernels/xdot/xdot_32.hpp
+++ b/src/database/kernels/xdot/xdot_32.hpp
@@ -47,6 +47,10 @@ const DatabaseEntry XdotSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -137,6 +141,10 @@ const DatabaseEntry XdotSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xdot/xdot_3232.hpp b/src/database/kernels/xdot/xdot_3232.hpp
index 63d5f75e..da9275bf 100644
--- a/src/database/kernels/xdot/xdot_3232.hpp
+++ b/src/database/kernels/xdot/xdot_3232.hpp
@@ -47,6 +47,10 @@ const DatabaseEntry XdotComplexSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -134,6 +138,10 @@ const DatabaseEntry XdotComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 128, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xdot/xdot_64.hpp b/src/database/kernels/xdot/xdot_64.hpp
index 1a1033ee..240c99fc 100644
--- a/src/database/kernels/xdot/xdot_64.hpp
+++ b/src/database/kernels/xdot/xdot_64.hpp
@@ -39,6 +39,10 @@ const DatabaseEntry XdotDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -120,6 +124,10 @@ const DatabaseEntry XdotDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xdot/xdot_6464.hpp b/src/database/kernels/xdot/xdot_6464.hpp
index 4804a164..a32844ef 100644
--- a/src/database/kernels/xdot/xdot_6464.hpp
+++ b/src/database/kernels/xdot/xdot_6464.hpp
@@ -39,6 +39,10 @@ const DatabaseEntry XdotComplexDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -120,6 +124,10 @@ const DatabaseEntry XdotComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemm/xgemm_32.hpp b/src/database/kernels/xgemm/xgemm_32.hpp
index de4a044c..4805cdb4 100644
--- a/src/database/kernels/xgemm/xgemm_32.hpp
+++ b/src/database/kernels/xgemm/xgemm_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry XgemmSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } },
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } },
+          { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } },
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
@@ -154,6 +158,10 @@ const DatabaseEntry XgemmSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 1 } },
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 0, 1, 32, 2, 8, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 2 } },
+          { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 8, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 2 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 8, 8, 32, 32, 32, 64, 1, 1, 0, 0, 4, 2 } },
         } },
diff --git a/src/database/kernels/xgemm/xgemm_3232.hpp b/src/database/kernels/xgemm/xgemm_3232.hpp
index d55aae56..29ab455c 100644
--- a/src/database/kernels/xgemm/xgemm_3232.hpp
+++ b/src/database/kernels/xgemm/xgemm_3232.hpp
@@ -151,6 +151,10 @@ const DatabaseEntry XgemmComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 4 } },
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 4 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 64, 1, 0, 0, 1, 2, 2 } },
+          { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 64, 1, 0, 0, 1, 2, 2 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 8, 8, 16, 32, 32, 64, 1, 1, 0, 0, 2, 1 } },
         } },
diff --git a/src/database/kernels/xgemm/xgemm_64.hpp b/src/database/kernels/xgemm/xgemm_64.hpp
index 2bc811d5..2549ea38 100644
--- a/src/database/kernels/xgemm/xgemm_64.hpp
+++ b/src/database/kernels/xgemm/xgemm_64.hpp
@@ -136,6 +136,10 @@ const DatabaseEntry XgemmDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 0, 1, 32, 2, 32, 32, 32, 16, 16, 32, 0, 0, 0, 0, 1, 2 } },
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 4 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 16, 0, 0, 0, 0, 2, 1 } },
+          { kDeviceNameDefault                                        , Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 16, 0, 0, 0, 0, 2, 1 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 2 } },
         } },
diff --git a/src/database/kernels/xgemm/xgemm_6464.hpp b/src/database/kernels/xgemm/xgemm_6464.hpp
index a3314f93..5f88e319 100644
--- a/src/database/kernels/xgemm/xgemm_6464.hpp
+++ b/src/database/kernels/xgemm/xgemm_6464.hpp
@@ -135,6 +135,10 @@ const DatabaseEntry XgemmComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 32, 32, 32, 32, 32, 64, 0, 0, 0, 0, 1, 2 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
+          { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } },
         } },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
index 0ca997fa..9168ce4d 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
@@ -113,6 +113,10 @@ const DatabaseEntry XgemmDirectSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 32, 8, 8, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 16, 8, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 16, 8, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
index fbd04af8..9b2ed40a 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
@@ -106,6 +106,10 @@ const DatabaseEntry XgemmDirectComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
index d589f916..62206ec0 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
@@ -99,6 +99,10 @@ const DatabaseEntry XgemmDirectDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
index e183985a..da0710be 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
@@ -99,6 +99,10 @@ const DatabaseEntry XgemmDirectComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv/xgemv_32.hpp b/src/database/kernels/xgemv/xgemv_32.hpp
index fca974ec..7f788521 100644
--- a/src/database/kernels/xgemv/xgemv_32.hpp
+++ b/src/database/kernels/xgemv/xgemv_32.hpp
@@ -152,6 +152,10 @@ const DatabaseEntry XgemvSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv/xgemv_3232.hpp b/src/database/kernels/xgemv/xgemv_3232.hpp
index a09524de..2025867f 100644
--- a/src/database/kernels/xgemv/xgemv_3232.hpp
+++ b/src/database/kernels/xgemv/xgemv_3232.hpp
@@ -145,6 +145,10 @@ const DatabaseEntry XgemvComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv/xgemv_64.hpp b/src/database/kernels/xgemv/xgemv_64.hpp
index b5a86a5f..71a6878f 100644
--- a/src/database/kernels/xgemv/xgemv_64.hpp
+++ b/src/database/kernels/xgemv/xgemv_64.hpp
@@ -127,8 +127,12 @@ const DatabaseEntry XgemvDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/xgemv/xgemv_6464.hpp b/src/database/kernels/xgemv/xgemv_6464.hpp
index d05efd2e..b380e440 100644
--- a/src/database/kernels/xgemv/xgemv_6464.hpp
+++ b/src/database/kernels/xgemv/xgemv_6464.hpp
@@ -108,8 +108,12 @@ const DatabaseEntry XgemvComplexDouble = {
           { Name{"GeForce GTX 1080 Ti                               "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
index 6271eab7..45cc0bed 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
@@ -152,6 +152,10 @@ const DatabaseEntry XgemvFastSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
index bfe475cb..a440f463 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
@@ -128,6 +128,10 @@ const DatabaseEntry XgemvFastComplexSingle = {
           { Name{"GeForce GTX 1080 Ti                               "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
index 0383bca3..c5470eaf 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
@@ -127,8 +127,12 @@ const DatabaseEntry XgemvFastDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp
index 005eccb3..3b0a21aa 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp
@@ -104,6 +104,10 @@ const DatabaseEntry XgemvFastComplexDouble = {
           { Name{"GeForce GTX 1080 Ti                               "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
index 212e9c2f..7b7b7a27 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
@@ -115,6 +115,10 @@ const DatabaseEntry XgemvFastRotSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
index 73bf5472..c495b2a8 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
@@ -96,6 +96,10 @@ const DatabaseEntry XgemvFastRotComplexSingle = {
           { Name{"GeForce GTX 1080 Ti                               "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp
index d372ad90..6a10f4b6 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp
@@ -92,6 +92,10 @@ const DatabaseEntry XgemvFastRotDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp
index 3ad4170e..a7e3acc3 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp
@@ -76,8 +76,12 @@ const DatabaseEntry XgemvFastRotComplexDouble = {
           { Name{"GeForce GTX 1080 Ti                               "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/xger/xger_16.hpp b/src/database/kernels/xger/xger_16.hpp
index 8c86df38..8b08a401 100644
--- a/src/database/kernels/xger/xger_16.hpp
+++ b/src/database/kernels/xger/xger_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry XgerHalf = {
           { Name{"AMD Radeon RX 480                                 "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
@@ -52,7 +56,7 @@ const DatabaseEntry XgerHalf = {
     { // Default
       kDeviceTypeAll, "default", {
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/xger/xger_32.hpp b/src/database/kernels/xger/xger_32.hpp
index 7455fdba..0b99d911 100644
--- a/src/database/kernels/xger/xger_32.hpp
+++ b/src/database/kernels/xger/xger_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry XgerSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -140,8 +144,12 @@ const DatabaseEntry XgerSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
-          { kDeviceNameDefault                                        , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
       }
     },
diff --git a/src/database/kernels/xger/xger_3232.hpp b/src/database/kernels/xger/xger_3232.hpp
index af0431a7..7f547e1e 100644
--- a/src/database/kernels/xger/xger_3232.hpp
+++ b/src/database/kernels/xger/xger_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry XgerComplexSingle = {
           { Name{"ATI Radeon HD 6750M                               "}, Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -137,6 +141,10 @@ const DatabaseEntry XgerComplexSingle = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xger/xger_64.hpp b/src/database/kernels/xger/xger_64.hpp
index f531dd5a..857efaa3 100644
--- a/src/database/kernels/xger/xger_64.hpp
+++ b/src/database/kernels/xger/xger_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry XgerDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -123,6 +127,10 @@ const DatabaseEntry XgerDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/database/kernels/xger/xger_6464.hpp b/src/database/kernels/xger/xger_6464.hpp
index b58867f1..bfeb114e 100644
--- a/src/database/kernels/xger/xger_6464.hpp
+++ b/src/database/kernels/xger/xger_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry XgerComplexDouble = {
           { Name{"AMD Radeon R9 380                                 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "Vega", {
+          { Name{"Radeon RX Vega                                    "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { Name{"AMD Radeon Pro 580 Compute Engine                 "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -123,6 +127,10 @@ const DatabaseEntry XgerComplexDouble = {
           { Name{"TITAN X (Pascal)                                  "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
           { kDeviceNameDefault                                        , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
+        { "SM7.5", {
+          { Name{"TITAN RTX                                         "}, Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+          { kDeviceNameDefault                                        , Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+        } },
         { "default", {
           { kDeviceNameDefault                                        , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
         } },
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index e65acf17..85cbdc86 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -64,7 +64,7 @@ void Xamax(const int n,
     #else
       x = fabs(x);
     #endif
-    if (x >= max) {
+    if (x > max) {
       max = x;
       imax = id*x_inc + x_offset;
     }
@@ -77,7 +77,7 @@ void Xamax(const int n,
   // Performs reduction in local memory
   for (int s=WGS1/2; s>0; s=s>>1) {
     if (lid < s) {
-      if (maxlm[lid + s] >= maxlm[lid]) {
+      if (maxlm[lid + s] > maxlm[lid]) {
         maxlm[lid] = maxlm[lid + s];
         imaxlm[lid] = imaxlm[lid + s];
       }
@@ -105,7 +105,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
   const int lid = get_local_id(0);
 
   // Performs the first step of the reduction while loading the data
-  if (maxgm[lid + WGS2] >= maxgm[lid]) {
+  if (maxgm[lid + WGS2] > maxgm[lid]) {
     maxlm[lid] = maxgm[lid + WGS2];
     imaxlm[lid] = imaxgm[lid + WGS2];
   }
@@ -118,7 +118,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
   // Performs reduction in local memory
   for (int s=WGS2/2; s>0; s=s>>1) {
     if (lid < s) {
-      if (maxlm[lid + s] >= maxlm[lid]) {
+      if (maxlm[lid + s] > maxlm[lid]) {
         maxlm[lid] = maxlm[lid + s];
         imaxlm[lid] = imaxlm[lid + s];
       }
diff --git a/src/pyclblast/samples/saxpybatched.py b/src/pyclblast/samples/saxpybatched.py
new file mode 100644
index 00000000..fa523945
--- /dev/null
+++ b/src/pyclblast/samples/saxpybatched.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0.
+# This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line.
+#
+# Author(s):
+#   Cedric Nugteren <www.cedricnugteren.nl>
+
+import numpy as np
+import pyopencl as cl
+from pyopencl.array import Array
+import pyclblast
+
+# Settings for this sample:
+batch_count = 2
+dtype = 'float32'
+alphas = [1.5, 1.0]
+n = 4
+
+print("# Setting up OpenCL")
+ctx = cl.create_some_context()
+queue = cl.CommandQueue(ctx)
+
+print("# Setting up Numpy arrays")
+x = np.random.rand(n * batch_count).astype(dtype=dtype)
+y = np.random.rand(n * batch_count).astype(dtype=dtype)
+
+print("# Batch offsets: next after each other")
+x_offsets = [0, n]
+y_offsets = [0, n]
+
+print("# Setting up OpenCL arrays")
+clx = Array(queue, x.shape, x.dtype)
+cly = Array(queue, y.shape, y.dtype)
+clx.set(x)
+cly.set(y)
+
+print("# Example level-1 batched operation: AXPY-batched")
+assert len(alphas) == len(x_offsets) == len(y_offsets) == batch_count
+pyclblast.axpyBatched(queue, n, clx, cly, alphas, x_offsets, y_offsets)
+queue.finish()
+
+print("# Full result for vector y: %s" % str(cly.get()))
+for i in range(batch_count):
+	result = alphas[i] * x[x_offsets[i]:x_offsets[i] + n] + y[y_offsets[i]:y_offsets[i] + n]
+	print("# Expected result batch #%d: %s" % (i, str(result)))
diff --git a/src/pyclblast/setup.py b/src/pyclblast/setup.py
index 1c1bf3ab..bcc966ed 100644
--- a/src/pyclblast/setup.py
+++ b/src/pyclblast/setup.py
@@ -22,7 +22,7 @@ ext_modules.append(
 
 setup(
     name="pyclblast",
-    version="1.2.0",
+    version="1.3.0",
     author="Cedric Nugteren",
     author_email="web@cedricnugteren.nl",
     url="https://github.com/CNugteren/CLBlast/blob/master/src/pyclblast",
diff --git a/src/pyclblast/src/pyclblast.pyx b/src/pyclblast/src/pyclblast.pyx
index 14efcf8a..eb46649f 100644
--- a/src/pyclblast/src/pyclblast.pyx
+++ b/src/pyclblast/src/pyclblast.pyx
@@ -364,6 +364,7 @@ def swap(queue, n, x, y, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0):
         err = CLBlastHswap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXswap' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -405,6 +406,7 @@ def scal(queue, n, x, x_inc = 1, alpha = 1.0, x_offset = 0):
         err = CLBlastHscal(n, <cl_half>alpha, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXscal' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -448,6 +450,7 @@ def copy(queue, n, x, y, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0):
         err = CLBlastHcopy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXcopy' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -491,6 +494,7 @@ def axpy(queue, n, x, y, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset = 0, y_offs
         err = CLBlastHaxpy(n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXaxpy' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -530,6 +534,7 @@ def dot(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0, d
         err = CLBlastHdot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXdot' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -566,6 +571,7 @@ def dotu(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0,
         err = CLBlastZdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXdotu' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -602,6 +608,7 @@ def dotc(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0,
         err = CLBlastZdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXdotc' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -645,6 +652,7 @@ def nrm2(queue, n, x, nrm2, x_inc = 1, x_offset = 0, nrm2_offset = 0):
         err = CLBlastHnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXnrm2' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -688,6 +696,7 @@ def asum(queue, n, x, asum, x_inc = 1, x_offset = 0, asum_offset = 0):
         err = CLBlastHasum(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXasum' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -731,6 +740,7 @@ def sum(queue, n, x, sum, x_inc = 1, x_offset = 0, sum_offset = 0):
         err = CLBlastHsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXsum' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -774,6 +784,7 @@ def amax(queue, n, x, imax, x_inc = 1, x_offset = 0, imax_offset = 0):
         err = CLBlastiHamax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXamax' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -817,6 +828,7 @@ def amin(queue, n, x, imin, x_inc = 1, x_offset = 0, imin_offset = 0):
         err = CLBlastiHamin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXamin' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -860,6 +872,7 @@ def max(queue, n, x, imax, x_inc = 1, x_offset = 0, imax_offset = 0):
         err = CLBlastiHmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXmax' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -903,6 +916,7 @@ def min(queue, n, x, imin, x_inc = 1, x_offset = 0, imin_offset = 0):
         err = CLBlastiHmin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXmin' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -949,6 +963,7 @@ def gemv(queue, m, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0
         err = CLBlastHgemv(CLBlastLayoutRowMajor, a_transpose, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXgemv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -995,6 +1010,7 @@ def gbmv(queue, m, n, kl, ku, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0,
         err = CLBlastHgbmv(CLBlastLayoutRowMajor, a_transpose, m, n, kl, ku, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXgbmv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1032,6 +1048,7 @@ def hemv(queue, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0,
         err = CLBlastZhemv(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXhemv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1069,6 +1086,7 @@ def hbmv(queue, n, k, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0
         err = CLBlastZhbmv(CLBlastLayoutRowMajor, triangle, n, k, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXhbmv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1106,6 +1124,7 @@ def hpmv(queue, n, ap, x, y, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.
         err = CLBlastZhpmv(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), ap_buffer, ap_offset, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXhpmv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1146,6 +1165,7 @@ def symv(queue, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0,
         err = CLBlastHsymv(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXsymv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1186,6 +1206,7 @@ def sbmv(queue, n, k, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0
         err = CLBlastHsbmv(CLBlastLayoutRowMajor, triangle, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXsbmv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1226,6 +1247,7 @@ def spmv(queue, n, ap, x, y, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.
         err = CLBlastHspmv(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXspmv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1272,6 +1294,7 @@ def trmv(queue, n, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = Fal
         err = CLBlastHtrmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXtrmv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1318,6 +1341,7 @@ def tbmv(queue, n, k, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp =
         err = CLBlastHtbmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXtbmv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1364,6 +1388,7 @@ def tpmv(queue, n, ap, x, ap_ld, x_inc = 1, lower_triangle = False, a_transp = F
         err = CLBlastHtpmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXtpmv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1407,6 +1432,7 @@ def trsv(queue, n, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = Fal
         err = CLBlastZtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXtrsv' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1446,6 +1472,7 @@ def ger(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset
         err = CLBlastHger(CLBlastLayoutRowMajor, m, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXger' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1482,6 +1509,7 @@ def geru(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset
         err = CLBlastZgeru(CLBlastLayoutRowMajor, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXgeru' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1518,6 +1546,7 @@ def gerc(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset
         err = CLBlastZgerc(CLBlastLayoutRowMajor, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXgerc' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1553,6 +1582,7 @@ def her(queue, n, x, a, a_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_
         err = CLBlastZher(CLBlastLayoutRowMajor, triangle, n, <cl_double>alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXher' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1588,6 +1618,7 @@ def hpr(queue, n, x, ap, ap_ld, x_inc = 1, alpha = 1.0, lower_triangle = False,
         err = CLBlastZhpr(CLBlastLayoutRowMajor, triangle, n, <cl_double>alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXhpr' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1625,6 +1656,7 @@ def her2(queue, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_trian
         err = CLBlastZher2(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXher2' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1662,6 +1694,7 @@ def hpr2(queue, n, x, y, ap, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_tri
         err = CLBlastZhpr2(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXhpr2' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1700,6 +1733,7 @@ def syr(queue, n, x, a, a_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_
         err = CLBlastHsyr(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXsyr' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1738,6 +1772,7 @@ def spr(queue, n, x, ap, ap_ld, x_inc = 1, alpha = 1.0, lower_triangle = False,
         err = CLBlastHspr(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXspr' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1778,6 +1813,7 @@ def syr2(queue, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_trian
         err = CLBlastHsyr2(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXsyr2' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1818,6 +1854,7 @@ def spr2(queue, n, x, y, ap, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_tri
         err = CLBlastHspr2(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXspr2' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1865,6 +1902,7 @@ def gemm(queue, m, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, a_t
         err = CLBlastHgemm(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXgemm' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1912,6 +1950,7 @@ def symm(queue, m, n, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, right_
         err = CLBlastHsymm(CLBlastLayoutRowMajor, side, triangle, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXsymm' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1950,6 +1989,7 @@ def hemm(queue, m, n, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, right_
         err = CLBlastZhemm(CLBlastLayoutRowMajor, side, triangle, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_double2>cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXhemm' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -1995,6 +2035,7 @@ def syrk(queue, n, k, a, c, a_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle
         err = CLBlastHsyrk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXsyrk' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -2031,6 +2072,7 @@ def herk(queue, n, k, a, c, a_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle
         err = CLBlastZherk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, <cl_double>alpha, a_buffer, a_offset, a_ld, <cl_double>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXherk' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -2078,6 +2120,7 @@ def syr2k(queue, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, lower
         err = CLBlastHsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXsyr2k' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -2116,6 +2159,7 @@ def her2k(queue, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, lower
         err = CLBlastZher2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_double>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXher2k' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -2163,6 +2207,7 @@ def trmm(queue, m, n, a, b, a_ld, b_ld, alpha = 1.0, right_side = False, lower_t
         err = CLBlastHtrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXtrmm' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
@@ -2207,11 +2252,223 @@ def trsm(queue, m, n, a, b, a_ld, b_ld, alpha = 1.0, right_side = False, lower_t
         err = CLBlastZtrsm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
     else:
         raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
     if err != CLBlastSuccess:
         raise RuntimeError("PyCLBlast: 'CLBlastXtrsm' failed: %s" % get_status_message(err))
     return cl.Event.from_int_ptr(<size_t>event)
 
 ####################################################################################################
+# Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
+####################################################################################################
+
+cdef extern from "clblast_c.h":
+    CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+
+def axpyBatched(queue, n, x, y, alphas, x_offsets, y_offsets, x_inc = 1, y_inc = 1):
+    """
+    xAXPYBATCHED: Batched version of AXPY
+    """
+
+    dtype = check_dtype([x, y], ["float32", "float64", "complex64", "complex128", "float16"])
+    check_vector(x, "x")
+    check_vector(y, "y")
+
+    if len(x_offsets) != len(y_offsets) != len(alphas):
+        raise RuntimeError("PyCLBlast: 'CLBlastXaxpyBatched' failed: length of batch-sized arguments x_offsets, y_offsets, alphas should be equal")
+    batch_count = len(x_offsets)
+
+    cdef size_t *x_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+    for i in range(batch_count):
+        x_offsets_c[i] = x_offsets[i]
+    cdef size_t *y_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+    for i in range(batch_count):
+        y_offsets_c[i] = y_offsets[i]
+    cdef void *alphas_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))
+    for i in range(batch_count):
+        if dtype == np.dtype("float32"):
+            (<cl_float*>alphas_c)[i] = <cl_float>alphas[i]
+        elif dtype == np.dtype("float64"):
+            (<cl_double*>alphas_c)[i] = <cl_double>alphas[i]
+        elif dtype == np.dtype("complex64"):
+            (<cl_float2*>alphas_c)[i] = <cl_float2>cl_float2(x=alphas[i].real,y=alphas[i].imag)
+        elif dtype == np.dtype("complex128"):
+            (<cl_double2*>alphas_c)[i] = <cl_double2>cl_double2(x=alphas[i].real,y=alphas[i].imag)
+        elif dtype == np.dtype("float16"):
+            (<cl_half*>alphas_c)[i] = <cl_half>alphas[i]
+
+    cdef cl_mem x_buffer = <cl_mem><size_t>x.base_data.int_ptr
+    cdef cl_mem y_buffer = <cl_mem><size_t>y.base_data.int_ptr
+
+    cdef cl_command_queue command_queue = <cl_command_queue><size_t>queue.int_ptr
+    cdef cl_event event = NULL
+
+    cdef CLBlastStatusCode err
+    if dtype == np.dtype("float32"):
+        err = CLBlastSaxpyBatched(n, <cl_float*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("float64"):
+        err = CLBlastDaxpyBatched(n, <cl_double*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("complex64"):
+        err = CLBlastCaxpyBatched(n, <cl_float2*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("complex128"):
+        err = CLBlastZaxpyBatched(n, <cl_double2*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("float16"):
+        err = CLBlastHaxpyBatched(n, <cl_half*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+    else:
+        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
+    PyMem_Free(x_offsets_c)
+    PyMem_Free(y_offsets_c)
+    PyMem_Free(alphas_c)
+
+    if err != CLBlastSuccess:
+        raise RuntimeError("PyCLBlast: 'CLBlastXaxpyBatched' failed: %s" % get_status_message(err))
+    return cl.Event.from_int_ptr(<size_t>event)
+
+####################################################################################################
+# Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
+####################################################################################################
+
+cdef extern from "clblast_c.h":
+    CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const float *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const double *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_float2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_double2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_half *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+
+def gemmBatched(queue, m, n, k, a, b, c, alphas, betas, a_ld, b_ld, c_ld, a_offsets, b_offsets, c_offsets, a_transp = False, b_transp = False):
+    """
+    xGEMMBATCHED: Batched version of GEMM
+    """
+
+    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
+    check_matrix(a, "a")
+    check_matrix(b, "b")
+    check_matrix(c, "c")
+
+    if len(a_offsets) != len(b_offsets) != len(c_offsets) != len(alphas) != len(betas):
+        raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: length of batch-sized arguments a_offsets, b_offsets, c_offsets, alphas, betas should be equal")
+    batch_count = len(a_offsets)
+
+    cdef size_t *a_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+    for i in range(batch_count):
+        a_offsets_c[i] = a_offsets[i]
+    cdef size_t *b_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+    for i in range(batch_count):
+        b_offsets_c[i] = b_offsets[i]
+    cdef size_t *c_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+    for i in range(batch_count):
+        c_offsets_c[i] = c_offsets[i]
+    cdef void *alphas_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))
+    for i in range(batch_count):
+        if dtype == np.dtype("float32"):
+            (<cl_float*>alphas_c)[i] = <cl_float>alphas[i]
+        elif dtype == np.dtype("float64"):
+            (<cl_double*>alphas_c)[i] = <cl_double>alphas[i]
+        elif dtype == np.dtype("complex64"):
+            (<cl_float2*>alphas_c)[i] = <cl_float2>cl_float2(x=alphas[i].real,y=alphas[i].imag)
+        elif dtype == np.dtype("complex128"):
+            (<cl_double2*>alphas_c)[i] = <cl_double2>cl_double2(x=alphas[i].real,y=alphas[i].imag)
+        elif dtype == np.dtype("float16"):
+            (<cl_half*>alphas_c)[i] = <cl_half>alphas[i]
+    cdef void *betas_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))
+    for i in range(batch_count):
+        if dtype == np.dtype("float32"):
+            (<cl_float*>betas_c)[i] = <cl_float>betas[i]
+        elif dtype == np.dtype("float64"):
+            (<cl_double*>betas_c)[i] = <cl_double>betas[i]
+        elif dtype == np.dtype("complex64"):
+            (<cl_float2*>betas_c)[i] = <cl_float2>cl_float2(x=betas[i].real,y=betas[i].imag)
+        elif dtype == np.dtype("complex128"):
+            (<cl_double2*>betas_c)[i] = <cl_double2>cl_double2(x=betas[i].real,y=betas[i].imag)
+        elif dtype == np.dtype("float16"):
+            (<cl_half*>betas_c)[i] = <cl_half>betas[i]
+
+    cdef cl_mem a_buffer = <cl_mem><size_t>a.base_data.int_ptr
+    cdef cl_mem b_buffer = <cl_mem><size_t>b.base_data.int_ptr
+    cdef cl_mem c_buffer = <cl_mem><size_t>c.base_data.int_ptr
+
+    cdef cl_command_queue command_queue = <cl_command_queue><size_t>queue.int_ptr
+    cdef cl_event event = NULL
+    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
+    b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo
+
+    cdef CLBlastStatusCode err
+    if dtype == np.dtype("float32"):
+        err = CLBlastSgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_float*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("float64"):
+        err = CLBlastDgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_double*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("complex64"):
+        err = CLBlastCgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float2*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_float2*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("complex128"):
+        err = CLBlastZgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double2*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_double2*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("float16"):
+        err = CLBlastHgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_half*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+    else:
+        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
+    PyMem_Free(a_offsets_c)
+    PyMem_Free(b_offsets_c)
+    PyMem_Free(c_offsets_c)
+    PyMem_Free(alphas_c)
+    PyMem_Free(betas_c)
+
+    if err != CLBlastSuccess:
+        raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: %s" % get_status_message(err))
+    return cl.Event.from_int_ptr(<size_t>event)
+
+####################################################################################################
+# StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED
+####################################################################################################
+
+cdef extern from "clblast_c.h":
+    CLBlastStatusCode CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+    CLBlastStatusCode CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+
+def gemmStridedBatched(queue, m, n, k, batch_count, a, b, c, a_ld, b_ld, c_ld, a_stride, b_stride, c_stride, alpha = 1.0, beta = 0.0, a_transp = False, b_transp = False, a_offset = 0, b_offset = 0, c_offset = 0):
+    """
+    xGEMMSTRIDEDBATCHED: StridedBatched version of GEMM
+    """
+
+    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
+    check_matrix(a, "a")
+    check_matrix(b, "b")
+    check_matrix(c, "c")
+
+    cdef cl_mem a_buffer = <cl_mem><size_t>a.base_data.int_ptr
+    cdef cl_mem b_buffer = <cl_mem><size_t>b.base_data.int_ptr
+    cdef cl_mem c_buffer = <cl_mem><size_t>c.base_data.int_ptr
+
+    cdef cl_command_queue command_queue = <cl_command_queue><size_t>queue.int_ptr
+    cdef cl_event event = NULL
+    a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
+    b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo
+
+    cdef CLBlastStatusCode err
+    if dtype == np.dtype("float32"):
+        err = CLBlastSgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float>alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_float>beta, c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("float64"):
+        err = CLBlastDgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double>alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_double>beta, c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("complex64"):
+        err = CLBlastCgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float2>cl_float2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_float2>cl_float2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("complex128"):
+        err = CLBlastZgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_double2>cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
+    elif dtype == np.dtype("float16"):
+        err = CLBlastHgemmStridedBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, a_stride, b_buffer, b_offset, b_ld, b_stride, <cl_half>beta, c_buffer, c_offset, c_ld, c_stride, batch_count, &command_queue, &event)
+    else:
+        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
+    if err != CLBlastSuccess:
+        raise RuntimeError("PyCLBlast: 'CLBlastXgemmStridedBatched' failed: %s" % get_status_message(err))
+    return cl.Event.from_int_ptr(<size_t>event)
+
+####################################################################################################
 # Overrides the parameters
 ####################################################################################################
 
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index c5eee527..be6b233e 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -82,7 +82,7 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
 void print_separator(const size_t parameters_size) {
   printf("x------x-------x");
   for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); }
-  printf("-x----------------x--------------x--------x-------------------x\n");
+  printf("-x-----------------x-----------------x----------------x--------------x--------x-------------------x\n");
 }
 
 // =================================================================================================
@@ -145,6 +145,7 @@ void Tuner(int argc, char* argv[], const int V,
   const auto platform = Platform(args.platform_id);
   const auto device = Device(platform, args.device_id);
   const auto context = Context(device);
+  auto queue = Queue(context, device);
 
   // Tests for validity of the precision and retrieves properties
   if (!PrecisionSupported<T>(device)) {
@@ -203,12 +204,11 @@ void Tuner(int argc, char* argv[], const int V,
   printf("\n");
   printf("|   ID | total |");
   for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf("     "); }
-  printf("param |       compiles |         time | %6s |            status |\n", settings.performance_unit.c_str());
+  printf("param |      local      |      global     |       compiles |         time | %6s |            status |\n", settings.performance_unit.c_str());
   print_separator(settings.parameters.size());
 
   // First runs a reference example to compare against
   try {
-    auto queue = Queue(context, device);
     printf("|  ref |     - |");
     for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf("     "); }
     printf("    - |");
@@ -219,6 +219,16 @@ void Tuner(int argc, char* argv[], const int V,
       device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
     }
 
+    // Sets the thread configuration
+    auto global = settings.global_size_ref;
+    auto local = settings.local_size_ref;
+
+    // Make sure that the global worksize is a multiple of the local
+    for (auto i=size_t{0}; i<global.size(); ++i) {
+      while ((global[i] / local[i]) * local[i] != global[i]) { global[i]++; }
+    }
+    printf("%8zu%8zu |%8zu%8zu |", local[0], local[1], global[0], global[1]);
+
     // Compiles the kernel
     auto compiler_options = std::vector<std::string>();
     const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
@@ -229,7 +239,7 @@ void Tuner(int argc, char* argv[], const int V,
 
     // Runs the kernel
     const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
-                                    settings.global_size_ref, settings.local_size_ref);
+                                    global, local);
     printf("      - |");
     if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }
 
@@ -251,8 +261,6 @@ void Tuner(int argc, char* argv[], const int V,
   auto results = std::vector<TuningResult>();
   for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
     try {
-      auto queue = Queue(context, device);
-
       auto configuration = configurations[config_id];
       printf("| %4zu | %5zu |", config_id + 1, configurations.size());
       for (const auto& parameter : settings.parameters) {
@@ -266,10 +274,16 @@ void Tuner(int argc, char* argv[], const int V,
       }
 
       // Sets the thread configuration
-      const auto global = SetThreadConfiguration(configuration, settings.global_size,
-                                                 settings.mul_global, settings.div_global);
-      const auto local = SetThreadConfiguration(configuration, settings.local_size,
-                                                settings.mul_local, settings.div_local);
+      auto global = SetThreadConfiguration(configuration, settings.global_size,
+                                           settings.mul_global, settings.div_global);
+      auto local = SetThreadConfiguration(configuration, settings.local_size,
+                                          settings.mul_local, settings.div_local);
+
+      // Make sure that the global worksize is a multiple of the local
+      for (auto i=size_t{0}; i<global.size(); ++i) {
+        while ((global[i] / local[i]) * local[i] != global[i]) { global[i]++; }
+      }
+      printf("%8zu%8zu |%8zu%8zu |", local[0], local[1], global[0], global[1]);
 
       // Sets the parameters for this configuration
       auto kernel_source = std::string{""};
author	Gard Spreemann <gspr@nonempty.org>	2021-01-20 15:42:05 +0100
committer	Gard Spreemann <gspr@nonempty.org>	2021-01-20 15:42:05 +0100
commit	c8e17e202f2ac8ab338a1a444b6be37a97e38226 (patch)
tree	9ecce4c2b90103725817782cc4412835ec95fac0
parent	b991afa34ae943548146e4f71bbbba8f1f4b1000 (diff)
parent	70016e869881df837402def4904b2888247e02d9 (diff)