summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGard Spreemann <gspr@nonempty.org>2021-01-20 15:42:05 +0100
committerGard Spreemann <gspr@nonempty.org>2021-01-20 15:42:05 +0100
commitc8e17e202f2ac8ab338a1a444b6be37a97e38226 (patch)
tree9ecce4c2b90103725817782cc4412835ec95fac0
parentb991afa34ae943548146e4f71bbbba8f1f4b1000 (diff)
parent70016e869881df837402def4904b2888247e02d9 (diff)
Merge tag '1.5.2' into debian/sid
-rw-r--r--.appveyor.yml4
-rw-r--r--.github/FUNDING.yml1
-rw-r--r--.travis.yml2
-rw-r--r--CHANGELOG10
-rw-r--r--CMakeLists.txt4
-rw-r--r--doc/api.md8
-rw-r--r--doc/tuning.md2
-rw-r--r--include/clblast.h5
-rw-r--r--include/clblast_c.h5
-rw-r--r--samples/samax.c102
-rw-r--r--scripts/benchmark/benchmark.py6
-rw-r--r--scripts/benchmark/plot.py8
-rw-r--r--scripts/benchmark/utils.py3
-rwxr-xr-xscripts/generator/generator.py10
-rw-r--r--scripts/generator/generator/pyclblast.py70
-rw-r--r--scripts/generator/generator/routine.py43
-rw-r--r--src/clpp11.hpp32
-rw-r--r--src/database/database_structure.hpp1
-rw-r--r--src/database/kernels/copy/copy_16.hpp4
-rw-r--r--src/database/kernels/copy/copy_32.hpp9
-rw-r--r--src/database/kernels/copy/copy_3232.hpp8
-rw-r--r--src/database/kernels/copy/copy_64.hpp10
-rw-r--r--src/database/kernels/copy/copy_6464.hpp8
-rw-r--r--src/database/kernels/gemm_routine/gemm_routine_32.hpp4
-rw-r--r--src/database/kernels/gemm_routine/gemm_routine_3232.hpp8
-rw-r--r--src/database/kernels/gemm_routine/gemm_routine_64.hpp8
-rw-r--r--src/database/kernels/gemm_routine/gemm_routine_6464.hpp4
-rw-r--r--src/database/kernels/invert/invert_32.hpp4
-rw-r--r--src/database/kernels/invert/invert_3232.hpp4
-rw-r--r--src/database/kernels/invert/invert_64.hpp4
-rw-r--r--src/database/kernels/invert/invert_6464.hpp4
-rw-r--r--src/database/kernels/pad/pad_16.hpp4
-rw-r--r--src/database/kernels/pad/pad_32.hpp8
-rw-r--r--src/database/kernels/pad/pad_3232.hpp8
-rw-r--r--src/database/kernels/pad/pad_64.hpp10
-rw-r--r--src/database/kernels/pad/pad_6464.hpp8
-rw-r--r--src/database/kernels/padtranspose/padtranspose_16.hpp4
-rw-r--r--src/database/kernels/padtranspose/padtranspose_32.hpp8
-rw-r--r--src/database/kernels/padtranspose/padtranspose_3232.hpp8
-rw-r--r--src/database/kernels/padtranspose/padtranspose_64.hpp8
-rw-r--r--src/database/kernels/padtranspose/padtranspose_6464.hpp8
-rw-r--r--src/database/kernels/transpose/transpose_16.hpp4
-rw-r--r--src/database/kernels/transpose/transpose_32.hpp8
-rw-r--r--src/database/kernels/transpose/transpose_3232.hpp8
-rw-r--r--src/database/kernels/transpose/transpose_64.hpp10
-rw-r--r--src/database/kernels/transpose/transpose_6464.hpp8
-rw-r--r--src/database/kernels/trsv_routine/trsv_routine_32.hpp4
-rw-r--r--src/database/kernels/trsv_routine/trsv_routine_3232.hpp4
-rw-r--r--src/database/kernels/trsv_routine/trsv_routine_64.hpp4
-rw-r--r--src/database/kernels/trsv_routine/trsv_routine_6464.hpp4
-rw-r--r--src/database/kernels/xaxpy/xaxpy_16.hpp4
-rw-r--r--src/database/kernels/xaxpy/xaxpy_32.hpp10
-rw-r--r--src/database/kernels/xaxpy/xaxpy_3232.hpp10
-rw-r--r--src/database/kernels/xaxpy/xaxpy_64.hpp8
-rw-r--r--src/database/kernels/xaxpy/xaxpy_6464.hpp8
-rw-r--r--src/database/kernels/xdot/xdot_16.hpp4
-rw-r--r--src/database/kernels/xdot/xdot_32.hpp8
-rw-r--r--src/database/kernels/xdot/xdot_3232.hpp8
-rw-r--r--src/database/kernels/xdot/xdot_64.hpp8
-rw-r--r--src/database/kernels/xdot/xdot_6464.hpp8
-rw-r--r--src/database/kernels/xgemm/xgemm_32.hpp8
-rw-r--r--src/database/kernels/xgemm/xgemm_3232.hpp4
-rw-r--r--src/database/kernels/xgemm/xgemm_64.hpp4
-rw-r--r--src/database/kernels/xgemm/xgemm_6464.hpp4
-rw-r--r--src/database/kernels/xgemm_direct/xgemm_direct_32.hpp4
-rw-r--r--src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp4
-rw-r--r--src/database/kernels/xgemm_direct/xgemm_direct_64.hpp4
-rw-r--r--src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp4
-rw-r--r--src/database/kernels/xgemv/xgemv_32.hpp4
-rw-r--r--src/database/kernels/xgemv/xgemv_3232.hpp4
-rw-r--r--src/database/kernels/xgemv/xgemv_64.hpp6
-rw-r--r--src/database/kernels/xgemv/xgemv_6464.hpp6
-rw-r--r--src/database/kernels/xgemv_fast/xgemv_fast_32.hpp4
-rw-r--r--src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp4
-rw-r--r--src/database/kernels/xgemv_fast/xgemv_fast_64.hpp6
-rw-r--r--src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp4
-rw-r--r--src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp4
-rw-r--r--src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp4
-rw-r--r--src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp4
-rw-r--r--src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp6
-rw-r--r--src/database/kernels/xger/xger_16.hpp6
-rw-r--r--src/database/kernels/xger/xger_32.hpp10
-rw-r--r--src/database/kernels/xger/xger_3232.hpp8
-rw-r--r--src/database/kernels/xger/xger_64.hpp8
-rw-r--r--src/database/kernels/xger/xger_6464.hpp8
-rw-r--r--src/kernels/level1/xamax.opencl8
-rw-r--r--src/pyclblast/samples/saxpybatched.py46
-rw-r--r--src/pyclblast/setup.py2
-rw-r--r--src/pyclblast/src/pyclblast.pyx257
-rw-r--r--src/tuning/tuning.cpp34
90 files changed, 1007 insertions, 71 deletions
diff --git a/.appveyor.yml b/.appveyor.yml
index 8ff37320..0fc3ca55 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -57,8 +57,8 @@ build_script:
after_build:
- ps: pushd $env:CLBLAST_BUILD
- - 7z a CLBlast-1.5.1-Windows-x64.zip .\install_dir\*
- - ps: mv CLBlast-1.5.1-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
+ - 7z a CLBlast-1.5.2-Windows-x64.zip .\install_dir\*
+ - ps: mv CLBlast-1.5.2-Windows-x64.zip $env:APPVEYOR_BUILD_FOLDER
artifacts:
- path: '*.zip'
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 00000000..be34db57
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1 @@
+github: CNugteren
diff --git a/.travis.yml b/.travis.yml
index e6d7fd16..f937222d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,7 +21,7 @@ matrix:
env:
global:
- - CLBLAST_VERSION=1.5.1
+ - CLBLAST_VERSION=1.5.2
- CLBLAST_ROOT=${TRAVIS_BUILD_DIR}/bin/clblast
- CLBLAST_INSTALL=${TRAVIS_BUILD_DIR}/bin/CLBlast-${CLBLAST_VERSION}
- CLBLAST_TAR=CLBlast-${CLBLAST_VERSION}-${TRAVIS_OS_NAME}-x64.tar.gz
diff --git a/CHANGELOG b/CHANGELOG
index de932972..e3614d80 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,13 @@
+Version 1.5.2
+- Changed XAMAX/XAMIN to more likely return first rather than last min/max index, updated API docs
+- Added batched routines to pyclblast
+- Added CLBLAST_VERSION_MAJOR/MINOR/PATCH defines in headers to store version numbering
+- Several small improvements to the benchmark script (thanks to 'baryluk')
+- Fixed a bug in the caching when using a context with multiple devices
+- Fixed a bug in the tuners related to global workgroup size not being a multiple of the local
+- Various minor fixes and enhancements
+- Added tuned parameters for various devices (see doc/tuning.md)
+
Version 1.5.1
- Implemented single-kernel version of convolution as GEMM
- Now catches all exceptions thrown by the tuners
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 09d93d88..2d280503 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,7 @@ endif()
project("clblast" C CXX)
set(clblast_VERSION_MAJOR 1)
set(clblast_VERSION_MINOR 5)
-set(clblast_VERSION_PATCH 1)
+set(clblast_VERSION_PATCH 2)
set(clblast_VERSION "${clblast_VERSION_MAJOR}.${clblast_VERSION_MINOR}.${clblast_VERSION_PATCH}")
set(clblast_SOVERSION ${clblast_VERSION_MAJOR})
@@ -228,7 +228,7 @@ set(PRECISIONS 32 64 3232 6464 16)
# Sample programs
if(OPENCL)
set(SAMPLE_PROGRAMS_CPP sgemm sgemm_batched dtrsm tuning_api)
- set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
+ set(SAMPLE_PROGRAMS_C sasum samax dgemv sgemm haxpy cache)
if(NETLIB)
set(SAMPLE_PROGRAMS_C ${SAMPLE_PROGRAMS_C} sgemm_netlib)
endif()
diff --git a/doc/api.md b/doc/api.md
index 996505f1..9d4bedfa 100644
--- a/doc/api.md
+++ b/doc/api.md
@@ -511,7 +511,7 @@ Arguments to SUM:
xAMAX: Index of absolute maximum value in a vector
-------------
-Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.
+Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.
C++ API:
```
@@ -562,7 +562,7 @@ Arguments to AMAX:
xAMIN: Index of absolute minimum value in a vector (non-BLAS function)
-------------
-Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.
+Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.
C++ API:
```
@@ -613,7 +613,7 @@ Arguments to AMIN:
xMAX: Index of maximum value in a vector (non-BLAS function)
-------------
-Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
+Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
C++ API:
```
@@ -664,7 +664,7 @@ Arguments to MAX:
xMIN: Index of minimum value in a vector (non-BLAS function)
-------------
-Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
+Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.
C++ API:
```
diff --git a/doc/tuning.md b/doc/tuning.md
index 20af9bd5..793bce88 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -30,6 +30,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
- GeForce GTX TITAN Black
- GeForce GTX TITAN X
- TITAN X (Pascal)
+ - TITAN RTX
- Tesla K20m
- Tesla K40m
- Tesla P100 16GB
@@ -44,6 +45,7 @@ The CLBlast library is already tuned for the most commonly used OpenCL devices a
- Radeon RX 480
- Radeon R9 Fury X
- Radeon Pro 580
+ - Radeon RX Vega
* Intel GPUs:
- HD Graphics 530
- HD Graphics 5500 BroadWell U-Processor GT2
diff --git a/include/clblast.h b/include/clblast.h
index 7a82361c..296f3987 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -39,6 +39,11 @@
#define PUBLIC_API
#endif
+// Version numbering (v1.5.2)
+#define CLBLAST_VERSION_MAJOR 1
+#define CLBLAST_VERSION_MINOR 5
+#define CLBLAST_VERSION_PATCH 2
+
namespace clblast {
// =================================================================================================
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 2ba6375a..7f7b1e97 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -34,6 +34,11 @@
#define PUBLIC_API
#endif
+// Version numbering (v1.5.2)
+#define CLBLAST_VERSION_MAJOR 1
+#define CLBLAST_VERSION_MINOR 5
+#define CLBLAST_VERSION_PATCH 2
+
// The C interface
#ifdef __cplusplus
extern "C" {
diff --git a/samples/samax.c b/samples/samax.c
new file mode 100644
index 00000000..36e78846
--- /dev/null
+++ b/samples/samax.c
@@ -0,0 +1,102 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the iSAMAX routine. It is pure C99 and demonstrates the use of
+// the C API to the CLBlast library.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define CL_TARGET_OPENCL_VERSION 110
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// =================================================================================================
+
+// Example use of the single-precision routine iSAMAX
+int main(void) {
+
+ // OpenCL platform/device settings
+ const size_t platform_id = 0;
+ const size_t device_id = 0;
+
+ // Example iSAMAX arguments
+ const size_t n = 1000;
+
+ // Initializes the OpenCL platform
+ cl_uint num_platforms;
+ clGetPlatformIDs(0, NULL, &num_platforms);
+ cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+ clGetPlatformIDs(num_platforms, platforms, NULL);
+ cl_platform_id platform = platforms[platform_id];
+
+ // Initializes the OpenCL device
+ cl_uint num_devices;
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+ cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+ cl_device_id device = devices[device_id];
+
+ // Creates the OpenCL context, queue, and an event
+ cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+ cl_event event = NULL;
+
+ // Populate host data structures with some example data
+ float* host_input = (float*)malloc(sizeof(float)*n);
+ unsigned int* host_output = (unsigned int*)malloc(sizeof(unsigned int)*1);
+ for (size_t i=0; i<n; ++i) { host_input[i] = (float)(i % 10); } // staircase modulo 10
+ for (size_t i=0; i<1; ++i) { host_output[i] = 77; } // some temp value to be overwritten later
+
+ // Copy the data-structures to the device
+ cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
+ cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(unsigned int), NULL, NULL);
+ clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
+ clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(unsigned int), host_output, 0, NULL, NULL);
+
+ // Call the iSAMAX routine.
+ CLBlastStatusCode status = CLBlastiSamax(n,
+ device_output, 0,
+ device_input, 0, 1,
+ &queue, &event);
+
+ // Wait for completion
+ if (status == CLBlastSuccess) {
+ clWaitForEvents(1, &event);
+ clReleaseEvent(event);
+ }
+
+ // Copies the result back to the host
+ clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(unsigned int), host_output, 0, NULL, NULL);
+
+ // Example completed. See "clblast_c.h" for status codes (0 -> success). 'n' is a size_t (%zu), the index an unsigned int (%u).
+ printf("Completed iSAMAX with status %d: array of %zu values with staircases from 0..9 repeated, max at index %u with value %.0lf\n",
+ status, n, host_output[0], host_input[host_output[0]]);
+
+ // Clean-up
+ free(platforms);
+ free(devices);
+ free(host_input);
+ free(host_output);
+ clReleaseMemObject(device_input);
+ clReleaseMemObject(device_output);
+ clReleaseCommandQueue(queue);
+ clReleaseContext(context);
+ return 0;
+}
+
+// =================================================================================================
diff --git a/scripts/benchmark/benchmark.py b/scripts/benchmark/benchmark.py
index 0bb37c10..81f1ec90 100644
--- a/scripts/benchmark/benchmark.py
+++ b/scripts/benchmark/benchmark.py
@@ -63,7 +63,7 @@ def run_benchmark(name, arguments_list, precision, num_runs, platform, device, c
all_arguments = [arg if arg != "-precision 16" else "-precision 32" for arg in all_arguments]
benchmark_output = utils.run_binary(binary, all_arguments)
result_extra = utils.parse_results(benchmark_output)
- for index in range(len(min(result, result_extra))):
+ for index in range(min(len(result), len(result_extra))):
result[index]["GBs_1_FP32"] = result_extra[index]["GBs_1"]
result[index]["GFLOPS_1_FP32"] = result_extra[index]["GFLOPS_1"]
for id in COMPARISON_IDS:
@@ -83,7 +83,7 @@ def parse_arguments(argv):
parser.add_argument("-d", "--device", required=True, type=int, help="The ID of the OpenCL device to test on")
parser.add_argument("-n", "--num_runs", type=int, default=None, help="Overrides the default number of benchmark repeats for averaging")
parser.add_argument("-x", "--precision", type=int, default=32, help="The precision to test for (choose from 16, 32, 64, 3232, 6464")
- parser.add_argument("-l", "--load_from_disk", action="store_true", help="Increase verbosity of the script")
+ parser.add_argument("-l", "--load_from_disk", action="store_true", help="Loading existing results from JSON file and replot")
parser.add_argument("-t", "--plot_title", default="", help="The title for the plots, defaults to benchmark name")
parser.add_argument("-z", "--tight_plot", action="store_true", help="Enables tight plot layout for in paper or presentation")
parser.add_argument("-o", "--output_folder", default=os.getcwd(), help="Sets the folder for output plots (defaults to current folder)")
@@ -103,7 +103,7 @@ def benchmark_single(benchmark, comparisons, platform, device, num_runs, precisi
# The benchmark name and plot title
benchmark_name = utils.precision_to_letter(precision) + benchmark.upper()
if benchmark.upper() != "SUMMARY":
- plot_title = benchmark_name if plot_title is "" else benchmark_name + ": " + plot_title
+ plot_title = benchmark_name if plot_title == "" else benchmark_name + ": " + plot_title
# Retrieves the comparison settings
library_ids = [1]
diff --git a/scripts/benchmark/plot.py b/scripts/benchmark/plot.py
index 6337b78f..b0b63df3 100644
--- a/scripts/benchmark/plot.py
+++ b/scripts/benchmark/plot.py
@@ -10,6 +10,7 @@ import matplotlib
matplotlib.use('Agg')
from matplotlib import rcParams
import matplotlib.pyplot as plt
+import numpy as np
# Colors
BLUEISH = [c / 255.0 for c in [71, 101, 177]] # #4765b1
@@ -24,7 +25,7 @@ def plot_graphs(results, file_name, num_rows, num_cols,
x_keys, y_keys, titles, x_labels, y_labels,
label_names, title, tight_plot, verbose):
assert len(results) == num_rows * num_cols
- assert len(results) != 1
+ assert len(results) >= 1
assert len(x_keys) == len(results)
assert len(y_keys) == len(results)
assert len(titles) == len(results)
@@ -64,6 +65,9 @@ def plot_graphs(results, file_name, num_rows, num_cols,
size_y = plot_size * num_rows
rcParams.update({'font.size': font_size})
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(size_x, size_y), facecolor='w', edgecolor='k')
+ if len(results) == 1 and not type(axes) is np.ndarray:
+ axes = np.full((1,1), axes)
+ assert type(axes) is np.ndarray
fig.text(.5, 0.92, title, horizontalalignment="center", fontsize=font_size_title)
plt.subplots_adjust(wspace=w_space, hspace=h_space)
@@ -72,7 +76,7 @@ def plot_graphs(results, file_name, num_rows, num_cols,
for col in range(num_cols):
index = row * num_cols + col
result = results[index]
- ax = axes.flat[index]
+ ax = axes[row, col]
plt.sca(ax)
print("[plot] Plotting subplot %d" % index)
diff --git a/scripts/benchmark/utils.py b/scripts/benchmark/utils.py
index 11aad805..fe0bd7f7 100644
--- a/scripts/benchmark/utils.py
+++ b/scripts/benchmark/utils.py
@@ -50,7 +50,8 @@ def run_binary(command, arguments):
full_command = command + " " + " ".join(arguments)
print("[benchmark] Calling binary: %s" % str(full_command))
try:
- return subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE).stdout.read()
+ result = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE).stdout.read()
+ return result.decode("ascii")
except OSError as e:
print("[benchmark] Error while running the binary, got exception: %s" + str(e))
return False
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 76c5dc1c..875b1a55 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -49,7 +49,7 @@ FILES = [
"/src/clblast_cuda.cpp",
"/src/pyclblast/src/pyclblast.pyx"
]
-HEADER_LINES = [124, 21, 128, 24, 29, 45, 29, 66, 40, 96, 21, 327]
+HEADER_LINES = [129, 21, 133, 24, 29, 45, 29, 66, 40, 96, 21, 327]
FOOTER_LINES = [98, 57, 112, 275, 6, 6, 6, 9, 2, 41, 56, 37]
HEADER_LINES_DOC = 0
FOOTER_LINES_DOC = 232
@@ -132,10 +132,10 @@ ROUTINES = [
Routine(True, True, 0, False, "1", "nrm2", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["nrm2"], [xn,"1"], [], "2*n", "Euclidian norm of a vector", "Accumulates the square of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.", []),
Routine(True, True, 0, False, "1", "asum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["asum"], [xn,"1"], [], "n", "Absolute sum of values in a vector", "Accumulates the absolute value of _n_ elements in the _x_ vector. The results are stored in the _asum_ buffer.", []),
Routine(True, False, 0, False, "1", "sum", T, [S,D,Sc,Dz,H], ["n"], [], ["x"], ["sum"], [xn,"1"], [], "n", "Sum of values in a vector (non-BLAS function)", "Accumulates the values of _n_ elements in the _x_ vector. The results are stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.", []),
- Routine(True, True, 0, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
- Routine(True, False, 0, False, "1", "amin", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []),
- Routine(True, False, 0, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
- Routine(True, False, 0, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
+ Routine(True, True, 0, False, "1", "amax", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of absolute maximum value in a vector", "Finds the index of a maximum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.", []),
+ Routine(True, False, 0, False, "1", "amin", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of absolute minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.", []),
+ Routine(True, False, 0, False, "1", "max", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imax"], [xn,"1"], [], "2*n", "Index of maximum value in a vector (non-BLAS function)", "Finds the index of a maximum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.", []),
+ Routine(True, False, 0, False, "1", "min", T, [iS,iD,iC,iZ,iH], ["n"], [], ["x"], ["imin"], [xn,"1"], [], "2*n", "Index of minimum value in a vector (non-BLAS function)", "Finds the index of a minimum (not necessarily the first if there are multiple) of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute minimum version of the IxAMAX BLAS routine.", []),
],
[ # Level 2: matrix-vector
Routine(True, True, 0, False, "2a", "gemv", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a","x"], ["y"], [amn,xmn,ynm], ["alpha","beta"], "", "General matrix-vector multiplication", "Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.", [ald_m]),
diff --git a/scripts/generator/generator/pyclblast.py b/scripts/generator/generator/pyclblast.py
index 47eb2eb4..b7ec348e 100644
--- a/scripts/generator/generator/pyclblast.py
+++ b/scripts/generator/generator/pyclblast.py
@@ -22,6 +22,16 @@ def to_np_dtype(flavour):
}[flavour.precision_name]
+def cl_type(flavour):
+    """Returns the OpenCL host type name matching the flavour's precision letter."""
+    opencl_types = dict(
+        S="cl_float", D="cl_double",
+        C="cl_float2", Z="cl_double2",
+        H="cl_half",
+    )
+    return opencl_types[flavour.precision_name]
+
+
def scalar_cython_conversion(scalar, flavour):
scalar_type = flavour.alpha_cl if scalar == "alpha" else flavour.beta_cl
if scalar_type == "float":
@@ -39,7 +49,9 @@ def scalar_cython_conversion(scalar, flavour):
def generate_pyx(routine):
result = ""
- if routine.implemented and routine.plain_name() and routine.level in ["1", "2a", "2b", "3"]:
+ if routine.implemented and routine.plain_name() and routine.level in ["1", "2a", "2b", "3", "x"]:
+ if routine.level == "x" and routine.batched == 0:
+ return result # level-X routines that are non-batched are not supported at the moment
indent = " "
result += SEPARATOR + NL
@@ -80,6 +92,33 @@ def generate_pyx(routine):
result += buf + ", \"" + buf + "\")" + NL
result += NL
+ # Batched checks
+ if routine.batched == 1: # batched but not strided-batched
+ lists = [b + "_offsets" for b in buffers] + [s + "s" for s in routine.scalars]
+ result += indent + "if " + " != ".join(["len(" + l + ")" for l in lists]) + ":" + NL
+ result += indent + indent + "raise RuntimeError(\"PyCLBlast: 'CLBlastX" + routine.plain_name() + "' failed: length of batch-sized arguments " + ", ".join(lists) + " should be equal\")" + NL
+ result += indent + "batch_count = len(" + lists[0] + ")" + NL
+ result += NL
+
+ # Batched list to pointer conversions
+ for buf in buffers:
+ result += indent + "cdef size_t *" + buf + "_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))" + NL
+ result += indent + "for i in range(batch_count):" + NL
+ result += indent + indent + "" + buf + "_offsets_c[i] = " + buf + "_offsets[i]" + NL
+ for scalar in routine.scalars:
+ result += indent + "cdef void *" + scalar + "s_c = <void *> PyMem_Malloc(batch_count * sizeof(dtype_size[dtype]))" + NL
+ result += indent + "for i in range(batch_count):" + NL
+ if_prefix = ""
+ for flavour in routine.flavours:
+ if flavour.precision_name in ["S", "D", "C", "Z", "H"]:
+ np_dtype = to_np_dtype(flavour)
+ result += indent + indent + if_prefix + "if dtype == np.dtype(\"" + np_dtype + "\"):" + NL
+ scalar_converted = scalar_cython_conversion(scalar + "s[i]", flavour)
+ result += indent + indent + indent + "(<" + cl_type(flavour) + "*>" + scalar + "s_c)[i] = " + scalar_converted + NL
+ if_prefix = "el"
+
+ result += NL
+
# Buffer transformation
for buf in buffers:
result += indent + "cdef cl_mem " + buf + "_buffer = <cl_mem><size_t>" + buf + ".base_data.int_ptr" + NL
@@ -108,11 +147,22 @@ def generate_pyx(routine):
for flavour in routine.flavours:
if flavour.precision_name in ["S", "D", "C", "Z", "H"]:
np_dtype = to_np_dtype(flavour)
- argument_names = [x.
- replace("layout", "CLBlastLayoutRowMajor").
- replace("alpha", scalar_cython_conversion("alpha", flavour)).
- replace("beta", scalar_cython_conversion("beta", flavour))
- for x in routine.arguments()]
+ if routine.batched != 1: # regular or strided-batched
+ argument_names = [x.
+ replace("layout", "CLBlastLayoutRowMajor").
+ replace("alpha", scalar_cython_conversion("alpha", flavour)).
+ replace("beta", scalar_cython_conversion("beta", flavour))
+ for x in routine.arguments()]
+ else: # batched but not strided-batched
+ argument_names = [x.
+ replace("layout", "CLBlastLayoutRowMajor").
+ replace("_cpp", "_c").
+ replace("_offsets", "_offsets_c").
+ replace("alphas_c", "<" + cl_type(flavour) + "*>alphas_c").
+ replace("betas_c", "<" + cl_type(flavour) + "*>betas_c")
+ for x in routine.arguments()]
+ if routine.batched > 0:
+ argument_names.append("batch_count")
result += indent + if_prefix + "if dtype == np.dtype(\"" + np_dtype + "\"):" + NL
result += indent + indent + "err = CLBlast" + flavour.name + routine.plain_name()
result += "(" + ", ".join(argument_names) + ", &command_queue, &event)" + NL
@@ -120,6 +170,14 @@ def generate_pyx(routine):
result += indent + "else:" + NL
result += indent + indent + "raise ValueError(\"PyCLBlast: Unrecognized data-type '%s'\" % dtype)" + NL
+ result += NL
+
+ # Cleaning up
+ if routine.batched == 1: # batched but not strided-batched
+ for array in [b + "_offset" for b in buffers] + routine.scalars:
+ result += indent + "PyMem_Free(" + array + "s_c)" + NL
+ result += NL
+
result += indent + "if err != CLBlastSuccess:" + NL
result += indent + indent + "raise RuntimeError(\"PyCLBlast: 'CLBlastX" + routine.plain_name() + "' failed: %s\" % get_status_message(err))" + NL
result += indent + "return cl.Event.from_int_ptr(<size_t>event)" + NL
diff --git a/scripts/generator/generator/routine.py b/scripts/generator/generator/routine.py
index 3b5a6b76..8b6ab57f 100644
--- a/scripts/generator/generator/routine.py
+++ b/scripts/generator/generator/routine.py
@@ -825,17 +825,37 @@ class Routine:
"""Arguments for the Python wrapper pyclblast"""
result = list()
result.extend(self.sizes)
+ if self.batched == 2: # strided batched
+ result.append("batch_count")
buffers = self.inputs + self.outputs
result.extend(buffers[:])
- for buf in buffers:
- if buf in self.buffers_matrix():
- result.append(buf + "_ld")
- for buf in buffers:
- if buf in self.buffers_vector():
- result.append(buf + "_inc = 1")
- for scalar in self.scalars:
- default = "1.0" if scalar == "alpha" else "0.0"
- result.append(scalar + " = " + default)
+ if self.batched != 1: # regular or strided-batched
+ for buf in buffers:
+ if buf in self.buffers_matrix():
+ result.append(buf + "_ld")
+ for buf in buffers:
+ if buf in self.buffers_vector():
+ result.append(buf + "_inc = 1")
+ if self.batched == 2: # strided batched
+ for buf in buffers:
+ if buf in self.buffers_matrix():
+ result.append(buf + "_stride")
+ for scalar in self.scalars:
+ if scalar != "":
+ default = "1.0" if scalar == "alpha" else "0.0"
+ result.append(scalar + " = " + default)
+ else: # batched but not strided-batched
+ for scalar in self.scalars:
+ result.append(scalar + "s")
+ for buf in buffers:
+ if buf in self.buffers_matrix():
+ result.append(buf + "_ld")
+ for buf in buffers:
+ if buf in self.buffers_vector() + self.buffers_matrix():
+ result.append(buf + "_offsets")
+ for buf in buffers:
+ if buf in self.buffers_vector():
+ result.append(buf + "_inc = 1")
for option in self.options:
if option == "a_transpose":
result.append("a_transp = False")
@@ -849,8 +869,9 @@ class Routine:
result.append("lower_triangle = False")
if option == "diagonal":
result.append("unit_diagonal = False")
- for buf in buffers:
- result.append(buf + "_offset = 0")
+ if self.batched != 1:
+ for buf in buffers:
+ result.append(buf + "_offset = 0")
return result
def requirements_doc(self):
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index aeb14989..2a25606c 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -47,6 +47,7 @@
#include <assert.h>
// OpenCL
+#define CL_TARGET_OPENCL_VERSION 110
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS // to disable deprecation warnings
@@ -508,12 +509,35 @@ class Program {
// Retrieves a binary or an intermediate representation of the compiled program
std::string GetIR() const {
- auto bytes = size_t{0};
- CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
+ cl_uint num_devices = 0;
+ CheckError(clGetProgramInfo(program_, CL_PROGRAM_NUM_DEVICES,
+ sizeof(cl_uint), &num_devices, nullptr));
+
+ std::vector<size_t> binSizesInBytes(num_devices, 0);
+ CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES,
+ num_devices * sizeof(size_t), binSizesInBytes.data(), nullptr));
+
+ auto bytes = size_t{0};
+ auto binSizeIter = size_t{0};
+ // Loop over the program binary sizes to find a binary whose size is > 0.
+ // The current logic assumes that there ever is only one valid program binary
+ // in a given cl_program. This should be the case unless the cl_program
+ // is built for all or a subset of devices associated to a given cl_program
+ for (; binSizeIter < binSizesInBytes.size(); ++binSizeIter) {
+ if (binSizesInBytes[binSizeIter] > 0) {
+ bytes = binSizesInBytes[binSizeIter];
+ break;
+ }
+ }
auto result = std::string{};
result.resize(bytes);
- auto result_ptr = result.data();
- CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
+
+ std::vector<char*> out(num_devices, nullptr);
+ out[binSizeIter] = const_cast<char*>(result.data());
+
+ CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES,
+ num_devices * sizeof(char*),
+ out.data(), nullptr));
return result;
}
diff --git a/src/database/database_structure.hpp b/src/database/database_structure.hpp
index 0199ec3a..ec6a16d2 100644
--- a/src/database/database_structure.hpp
+++ b/src/database/database_structure.hpp
@@ -21,6 +21,7 @@
// Just needed for 'Precision'
#ifdef OPENCL_API
+ #define CL_TARGET_OPENCL_VERSION 110
#include "clblast.h"
#elif CUDA_API
#include "clblast_cuda.h"
diff --git a/src/database/kernels/copy/copy_16.hpp b/src/database/kernels/copy/copy_16.hpp
index 564bea6b..a13072f0 100644
--- a/src/database/kernels/copy/copy_16.hpp
+++ b/src/database/kernels/copy/copy_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry CopyHalf = {
{ Name{"AMD Radeon RX 480 "}, Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 8, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/copy/copy_32.hpp b/src/database/kernels/copy/copy_32.hpp
index 3ef38ccb..9f24afea 100644
--- a/src/database/kernels/copy/copy_32.hpp
+++ b/src/database/kernels/copy/copy_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry CopySingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -87,6 +91,7 @@ const DatabaseEntry CopySingle = {
{ // Intel GPUs
kDeviceTypeGPU, "Intel", {
{ "default", {
+ { Name{"Intel(R) Gen9 HD Graphics NEO "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 530 "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 5500 BroadWell U-Processor GT"}, Params{ 32, 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ Name{"Intel(R) HD Graphics 6000 BroadWell U-Processor GT"}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -154,6 +159,10 @@ const DatabaseEntry CopySingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 8, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/copy/copy_3232.hpp b/src/database/kernels/copy/copy_3232.hpp
index 81feba9f..08ee3c25 100644
--- a/src/database/kernels/copy/copy_3232.hpp
+++ b/src/database/kernels/copy/copy_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry CopyComplexSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -149,6 +153,10 @@ const DatabaseEntry CopyComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/copy/copy_64.hpp b/src/database/kernels/copy/copy_64.hpp
index 52ce191e..f6ef641e 100644
--- a/src/database/kernels/copy/copy_64.hpp
+++ b/src/database/kernels/copy/copy_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry CopyDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 32, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 8, 4, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,8 +140,12 @@ const DatabaseEntry CopyDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/copy/copy_6464.hpp b/src/database/kernels/copy/copy_6464.hpp
index d0e31201..8607efc5 100644
--- a/src/database/kernels/copy/copy_6464.hpp
+++ b/src/database/kernels/copy/copy_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry CopyComplexDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry CopyComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/gemm_routine/gemm_routine_32.hpp b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
index 97b9c5ed..68e18f01 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_32.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_32.hpp
@@ -61,6 +61,10 @@ const DatabaseEntry GemmRoutineSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 1664, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1536, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
index c3e60fe5..3f5e32bc 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_3232.hpp
@@ -48,15 +48,19 @@ const DatabaseEntry GemmRoutineComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 1472, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1856, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1856, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 1280, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/gemm_routine/gemm_routine_64.hpp b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
index bd3df89a..036da90f 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_64.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_64.hpp
@@ -39,15 +39,19 @@ const DatabaseEntry GemmRoutineDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1344, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 896, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 960, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
index 044fa5d8..facfd8c8 100644
--- a/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
+++ b/src/database/kernels/gemm_routine/gemm_routine_6464.hpp
@@ -39,6 +39,10 @@ const DatabaseEntry GemmRoutineComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 576, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 832, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/invert/invert_32.hpp b/src/database/kernels/invert/invert_32.hpp
index aaaec3e5..aeb8a6d4 100644
--- a/src/database/kernels/invert/invert_32.hpp
+++ b/src/database/kernels/invert/invert_32.hpp
@@ -50,6 +50,10 @@ const DatabaseEntry InvertSingle = {
{ Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/invert/invert_3232.hpp b/src/database/kernels/invert/invert_3232.hpp
index 5666e9a2..575b104a 100644
--- a/src/database/kernels/invert/invert_3232.hpp
+++ b/src/database/kernels/invert/invert_3232.hpp
@@ -49,6 +49,10 @@ const DatabaseEntry InvertComplexSingle = {
{ Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/invert/invert_64.hpp b/src/database/kernels/invert/invert_64.hpp
index 2cd5e369..96a5223a 100644
--- a/src/database/kernels/invert/invert_64.hpp
+++ b/src/database/kernels/invert/invert_64.hpp
@@ -40,6 +40,10 @@ const DatabaseEntry InvertDouble = {
{ Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/invert/invert_6464.hpp b/src/database/kernels/invert/invert_6464.hpp
index 4e518e81..ebacda6c 100644
--- a/src/database/kernels/invert/invert_6464.hpp
+++ b/src/database/kernels/invert/invert_6464.hpp
@@ -40,6 +40,10 @@ const DatabaseEntry InvertComplexDouble = {
{ Name{"GeForce GTX 1070 Ti "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 1, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/pad/pad_16.hpp b/src/database/kernels/pad/pad_16.hpp
index ca5acf75..17367a44 100644
--- a/src/database/kernels/pad/pad_16.hpp
+++ b/src/database/kernels/pad/pad_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry PadHalf = {
{ Name{"AMD Radeon RX 480 "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/pad/pad_32.hpp b/src/database/kernels/pad/pad_32.hpp
index 7e554e31..52529d1e 100644
--- a/src/database/kernels/pad/pad_32.hpp
+++ b/src/database/kernels/pad/pad_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry PadSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -154,6 +158,10 @@ const DatabaseEntry PadSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/pad/pad_3232.hpp b/src/database/kernels/pad/pad_3232.hpp
index cea79c14..cfe5c632 100644
--- a/src/database/kernels/pad/pad_3232.hpp
+++ b/src/database/kernels/pad/pad_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry PadComplexSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -151,6 +155,10 @@ const DatabaseEntry PadComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 32, 32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/pad/pad_64.hpp b/src/database/kernels/pad/pad_64.hpp
index e85dcb17..b398eca9 100644
--- a/src/database/kernels/pad/pad_64.hpp
+++ b/src/database/kernels/pad/pad_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry PadDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 8, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,8 +140,12 @@ const DatabaseEntry PadDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 8, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 16, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/pad/pad_6464.hpp b/src/database/kernels/pad/pad_6464.hpp
index 52a4d0e0..b3898574 100644
--- a/src/database/kernels/pad/pad_6464.hpp
+++ b/src/database/kernels/pad/pad_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry PadComplexDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 8, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 16, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry PadComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 16, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 8, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/padtranspose/padtranspose_16.hpp b/src/database/kernels/padtranspose/padtranspose_16.hpp
index 4a4f619c..13979178 100644
--- a/src/database/kernels/padtranspose/padtranspose_16.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry PadtransposeHalf = {
{ Name{"AMD Radeon RX 480 "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/padtranspose/padtranspose_32.hpp b/src/database/kernels/padtranspose/padtranspose_32.hpp
index f7adc03c..fb09bc16 100644
--- a/src/database/kernels/padtranspose/padtranspose_32.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry PadtransposeSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -153,6 +157,10 @@ const DatabaseEntry PadtransposeSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/padtranspose/padtranspose_3232.hpp b/src/database/kernels/padtranspose/padtranspose_3232.hpp
index e37e385a..5aaa946d 100644
--- a/src/database/kernels/padtranspose/padtranspose_3232.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry PadtransposeComplexSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -151,6 +155,10 @@ const DatabaseEntry PadtransposeComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/padtranspose/padtranspose_64.hpp b/src/database/kernels/padtranspose/padtranspose_64.hpp
index 77328adb..6ac155d9 100644
--- a/src/database/kernels/padtranspose/padtranspose_64.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry PadtransposeDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 0, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry PadtransposeDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/padtranspose/padtranspose_6464.hpp b/src/database/kernels/padtranspose/padtranspose_6464.hpp
index 8330bffb..8ad7337f 100644
--- a/src/database/kernels/padtranspose/padtranspose_6464.hpp
+++ b/src/database/kernels/padtranspose/padtranspose_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry PadtransposeComplexDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 0, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 0, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry PadtransposeComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 0, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/transpose/transpose_16.hpp b/src/database/kernels/transpose/transpose_16.hpp
index a83e81cc..be06265c 100644
--- a/src/database/kernels/transpose/transpose_16.hpp
+++ b/src/database/kernels/transpose/transpose_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry TransposeHalf = {
{ Name{"AMD Radeon RX 480 "}, Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 4, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/transpose/transpose_32.hpp b/src/database/kernels/transpose/transpose_32.hpp
index 7e9dec8c..5aca3154 100644
--- a/src/database/kernels/transpose/transpose_32.hpp
+++ b/src/database/kernels/transpose/transpose_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry TransposeSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -153,6 +157,10 @@ const DatabaseEntry TransposeSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 8, 1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/transpose/transpose_3232.hpp b/src/database/kernels/transpose/transpose_3232.hpp
index f4990954..b7341b88 100644
--- a/src/database/kernels/transpose/transpose_3232.hpp
+++ b/src/database/kernels/transpose/transpose_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry TransposeComplexSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -143,6 +147,10 @@ const DatabaseEntry TransposeComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/transpose/transpose_64.hpp b/src/database/kernels/transpose/transpose_64.hpp
index 9f205565..3e73f07b 100644
--- a/src/database/kernels/transpose/transpose_64.hpp
+++ b/src/database/kernels/transpose/transpose_64.hpp
@@ -43,9 +43,13 @@ const DatabaseEntry TransposeDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 4, 0, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -136,6 +140,10 @@ const DatabaseEntry TransposeDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 16, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/transpose/transpose_6464.hpp b/src/database/kernels/transpose/transpose_6464.hpp
index 2029c4fb..d16d9534 100644
--- a/src/database/kernels/transpose/transpose_6464.hpp
+++ b/src/database/kernels/transpose/transpose_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry TransposeComplexDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 8, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -127,6 +131,10 @@ const DatabaseEntry TransposeComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 16, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_32.hpp b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
index 7d9b9372..e6184116 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_32.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_32.hpp
@@ -42,6 +42,10 @@ const DatabaseEntry TrsvRoutineSingle = {
{ Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
index 6ec322fa..e479f235 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_3232.hpp
@@ -41,6 +41,10 @@ const DatabaseEntry TrsvRoutineComplexSingle = {
{ Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_64.hpp b/src/database/kernels/trsv_routine/trsv_routine_64.hpp
index db65f847..d5c66625 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_64.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_64.hpp
@@ -32,6 +32,10 @@ const DatabaseEntry TrsvRoutineDouble = {
{ Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/trsv_routine/trsv_routine_6464.hpp b/src/database/kernels/trsv_routine/trsv_routine_6464.hpp
index 4a98b0a7..db52f414 100644
--- a/src/database/kernels/trsv_routine/trsv_routine_6464.hpp
+++ b/src/database/kernels/trsv_routine/trsv_routine_6464.hpp
@@ -32,6 +32,10 @@ const DatabaseEntry TrsvRoutineComplexDouble = {
{ Name{"GeForce GTX 1070 Ti "}, Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xaxpy/xaxpy_16.hpp b/src/database/kernels/xaxpy/xaxpy_16.hpp
index ac243f05..a7548c9f 100644
--- a/src/database/kernels/xaxpy/xaxpy_16.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry XaxpyHalf = {
{ Name{"AMD Radeon RX 480 "}, Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 4, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xaxpy/xaxpy_32.hpp b/src/database/kernels/xaxpy/xaxpy_32.hpp
index 92ea904d..c2eba834 100644
--- a/src/database/kernels/xaxpy/xaxpy_32.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry XaxpySingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -154,8 +158,12 @@ const DatabaseEntry XaxpySingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 4, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 4, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xaxpy/xaxpy_3232.hpp b/src/database/kernels/xaxpy/xaxpy_3232.hpp
index a1c69b83..8f686b3e 100644
--- a/src/database/kernels/xaxpy/xaxpy_3232.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_3232.hpp
@@ -51,9 +51,13 @@ const DatabaseEntry XaxpyComplexSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
- { kDeviceNameDefault , Params{ 2, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
@@ -151,6 +155,10 @@ const DatabaseEntry XaxpyComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xaxpy/xaxpy_64.hpp b/src/database/kernels/xaxpy/xaxpy_64.hpp
index 5cc36a65..4494cf05 100644
--- a/src/database/kernels/xaxpy/xaxpy_64.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry XaxpyDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 64, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry XaxpyDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 2, 512, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 2, 1024, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xaxpy/xaxpy_6464.hpp b/src/database/kernels/xaxpy/xaxpy_6464.hpp
index 07bac80b..aae465cf 100644
--- a/src/database/kernels/xaxpy/xaxpy_6464.hpp
+++ b/src/database/kernels/xaxpy/xaxpy_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry XaxpyComplexDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -136,6 +140,10 @@ const DatabaseEntry XaxpyComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 1, 256, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 512, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 128, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xdot/xdot_16.hpp b/src/database/kernels/xdot/xdot_16.hpp
index 4d1b96d3..f78c1606 100644
--- a/src/database/kernels/xdot/xdot_16.hpp
+++ b/src/database/kernels/xdot/xdot_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry XdotHalf = {
{ Name{"AMD Radeon RX 480 "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xdot/xdot_32.hpp b/src/database/kernels/xdot/xdot_32.hpp
index 560e51ca..47c57f12 100644
--- a/src/database/kernels/xdot/xdot_32.hpp
+++ b/src/database/kernels/xdot/xdot_32.hpp
@@ -47,6 +47,10 @@ const DatabaseEntry XdotSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -137,6 +141,10 @@ const DatabaseEntry XdotSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1024, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 512, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xdot/xdot_3232.hpp b/src/database/kernels/xdot/xdot_3232.hpp
index 63d5f75e..da9275bf 100644
--- a/src/database/kernels/xdot/xdot_3232.hpp
+++ b/src/database/kernels/xdot/xdot_3232.hpp
@@ -47,6 +47,10 @@ const DatabaseEntry XdotComplexSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -134,6 +138,10 @@ const DatabaseEntry XdotComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 512, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xdot/xdot_64.hpp b/src/database/kernels/xdot/xdot_64.hpp
index 1a1033ee..240c99fc 100644
--- a/src/database/kernels/xdot/xdot_64.hpp
+++ b/src/database/kernels/xdot/xdot_64.hpp
@@ -39,6 +39,10 @@ const DatabaseEntry XdotDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -120,6 +124,10 @@ const DatabaseEntry XdotDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 256, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xdot/xdot_6464.hpp b/src/database/kernels/xdot/xdot_6464.hpp
index 4804a164..a32844ef 100644
--- a/src/database/kernels/xdot/xdot_6464.hpp
+++ b/src/database/kernels/xdot/xdot_6464.hpp
@@ -39,6 +39,10 @@ const DatabaseEntry XdotComplexDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 256, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -120,6 +124,10 @@ const DatabaseEntry XdotComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 128, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemm/xgemm_32.hpp b/src/database/kernels/xgemm/xgemm_32.hpp
index de4a044c..4805cdb4 100644
--- a/src/database/kernels/xgemm/xgemm_32.hpp
+++ b/src/database/kernels/xgemm/xgemm_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry XgemmSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } },
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 16, 128, 8, 8, 128, 0, 0, 1, 1, 8, 8 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } },
+ { kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 16, 16, 64, 1, 1, 0, 0, 2, 4 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 0, 1, 32, 2, 8, 8, 32, 8, 8, 64, 0, 0, 0, 0, 4, 4 } },
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
@@ -154,6 +158,10 @@ const DatabaseEntry XgemmSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 1 } },
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 64, 8, 8, 64, 1, 1, 0, 0, 4, 4 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 2 } },
+ { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 32, 128, 16, 8, 128, 1, 1, 1, 1, 2, 2 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 32, 32, 32, 64, 1, 1, 0, 0, 4, 2 } },
} },
diff --git a/src/database/kernels/xgemm/xgemm_3232.hpp b/src/database/kernels/xgemm/xgemm_3232.hpp
index d55aae56..29ab455c 100644
--- a/src/database/kernels/xgemm/xgemm_3232.hpp
+++ b/src/database/kernels/xgemm/xgemm_3232.hpp
@@ -151,6 +151,10 @@ const DatabaseEntry XgemmComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 4 } },
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 8, 8, 32, 1, 1, 0, 0, 2, 4 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 64, 1, 0, 0, 1, 2, 2 } },
+ { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 64, 1, 0, 0, 1, 2, 2 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 32, 32, 64, 1, 1, 0, 0, 2, 1 } },
} },
diff --git a/src/database/kernels/xgemm/xgemm_64.hpp b/src/database/kernels/xgemm/xgemm_64.hpp
index 2bc811d5..2549ea38 100644
--- a/src/database/kernels/xgemm/xgemm_64.hpp
+++ b/src/database/kernels/xgemm/xgemm_64.hpp
@@ -136,6 +136,10 @@ const DatabaseEntry XgemmDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 32, 32, 32, 16, 16, 32, 0, 0, 0, 0, 1, 2 } },
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 32, 16, 16, 64, 0, 0, 0, 0, 2, 4 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 16, 0, 0, 0, 0, 2, 1 } },
+ { kDeviceNameDefault , Params{ 1, 2, 1, 1, 16, 16, 32, 8, 8, 16, 0, 0, 0, 0, 2, 1 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 64, 8, 8, 32, 1, 1, 0, 0, 2, 2 } },
} },
diff --git a/src/database/kernels/xgemm/xgemm_6464.hpp b/src/database/kernels/xgemm/xgemm_6464.hpp
index a3314f93..5f88e319 100644
--- a/src/database/kernels/xgemm/xgemm_6464.hpp
+++ b/src/database/kernels/xgemm/xgemm_6464.hpp
@@ -135,6 +135,10 @@ const DatabaseEntry XgemmComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 0, 1, 32, 2, 16, 16, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 32, 32, 32, 32, 32, 64, 0, 0, 0, 0, 1, 2 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
+ { kDeviceNameDefault , Params{ 0, 1, 32, 2, 8, 8, 16, 16, 16, 16, 0, 0, 0, 0, 1, 1 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 0, 1, 32, 2, 16, 16, 16, 8, 8, 16, 1, 1, 0, 0, 1, 2 } },
} },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
index 0ca997fa..9168ce4d 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_32.hpp
@@ -113,6 +113,10 @@ const DatabaseEntry XgemmDirectSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 32, 8, 8, 16, 1, 1, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 16, 8, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 16, 8, 16, 32, 8, 1, 0, 1, 1, 32, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 2, 8, 8, 16, 16, 1, 1, 4, 2, 32, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
index fbd04af8..9b2ed40a 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_3232.hpp
@@ -106,6 +106,10 @@ const DatabaseEntry XgemmDirectComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 8, 8, 16, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
index d589f916..62206ec0 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_64.hpp
@@ -99,6 +99,10 @@ const DatabaseEntry XgemmDirectDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 2, 2, 16, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp b/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
index e183985a..da0710be 100644
--- a/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
+++ b/src/database/kernels/xgemm_direct/xgemm_direct_6464.hpp
@@ -99,6 +99,10 @@ const DatabaseEntry XgemmDirectComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 2, 16, 16, 8, 8, 1, 1, 1, 2, 16, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 8, 8, 8, 8, 1, 1, 1, 1, 8, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 2, 16, 16, 8, 8, 1, 1, 1, 1, 16, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv/xgemv_32.hpp b/src/database/kernels/xgemv/xgemv_32.hpp
index fca974ec..7f788521 100644
--- a/src/database/kernels/xgemv/xgemv_32.hpp
+++ b/src/database/kernels/xgemv/xgemv_32.hpp
@@ -152,6 +152,10 @@ const DatabaseEntry XgemvSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv/xgemv_3232.hpp b/src/database/kernels/xgemv/xgemv_3232.hpp
index a09524de..2025867f 100644
--- a/src/database/kernels/xgemv/xgemv_3232.hpp
+++ b/src/database/kernels/xgemv/xgemv_3232.hpp
@@ -145,6 +145,10 @@ const DatabaseEntry XgemvComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv/xgemv_64.hpp b/src/database/kernels/xgemv/xgemv_64.hpp
index b5a86a5f..71a6878f 100644
--- a/src/database/kernels/xgemv/xgemv_64.hpp
+++ b/src/database/kernels/xgemv/xgemv_64.hpp
@@ -127,8 +127,12 @@ const DatabaseEntry XgemvDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv/xgemv_6464.hpp b/src/database/kernels/xgemv/xgemv_6464.hpp
index d05efd2e..b380e440 100644
--- a/src/database/kernels/xgemv/xgemv_6464.hpp
+++ b/src/database/kernels/xgemv/xgemv_6464.hpp
@@ -108,8 +108,12 @@ const DatabaseEntry XgemvComplexDouble = {
{ Name{"GeForce GTX 1080 Ti "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
index 6271eab7..45cc0bed 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_32.hpp
@@ -152,6 +152,10 @@ const DatabaseEntry XgemvFastSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
index bfe475cb..a440f463 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_3232.hpp
@@ -128,6 +128,10 @@ const DatabaseEntry XgemvFastComplexSingle = {
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
index 0383bca3..c5470eaf 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_64.hpp
@@ -127,8 +127,12 @@ const DatabaseEntry XgemvFastDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 1, 256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp b/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp
index 005eccb3..3b0a21aa 100644
--- a/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp
+++ b/src/database/kernels/xgemv_fast/xgemv_fast_6464.hpp
@@ -104,6 +104,10 @@ const DatabaseEntry XgemvFastComplexDouble = {
{ Name{"GeForce GTX 1080 Ti "}, Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 1, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 1, 64, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
index 212e9c2f..7b7b7a27 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_32.hpp
@@ -115,6 +115,10 @@ const DatabaseEntry XgemvFastRotSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 64, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
index 73bf5472..c495b2a8 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_3232.hpp
@@ -96,6 +96,10 @@ const DatabaseEntry XgemvFastRotComplexSingle = {
{ Name{"GeForce GTX 1080 Ti "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 16, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp
index d372ad90..6a10f4b6 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_64.hpp
@@ -92,6 +92,10 @@ const DatabaseEntry XgemvFastRotDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 4, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp
index 3ad4170e..a7e3acc3 100644
--- a/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp
+++ b/src/database/kernels/xgemv_fast_rot/xgemv_fast_rot_6464.hpp
@@ -76,8 +76,12 @@ const DatabaseEntry XgemvFastRotComplexDouble = {
{ Name{"GeForce GTX 1080 Ti "}, Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 32, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 1, 32, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 4, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 2, 32, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xger/xger_16.hpp b/src/database/kernels/xger/xger_16.hpp
index 8c86df38..8b08a401 100644
--- a/src/database/kernels/xger/xger_16.hpp
+++ b/src/database/kernels/xger/xger_16.hpp
@@ -18,6 +18,10 @@ const DatabaseEntry XgerHalf = {
{ Name{"AMD Radeon RX 480 "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
@@ -52,7 +56,7 @@ const DatabaseEntry XgerHalf = {
{ // Default
kDeviceTypeAll, "default", {
{ "default", {
- { kDeviceNameDefault , Params{ 4, 64, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xger/xger_32.hpp b/src/database/kernels/xger/xger_32.hpp
index 7455fdba..0b99d911 100644
--- a/src/database/kernels/xger/xger_32.hpp
+++ b/src/database/kernels/xger/xger_32.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry XgerSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 16, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -140,8 +144,12 @@ const DatabaseEntry XgerSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 512, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 8, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
- { kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
}
},
diff --git a/src/database/kernels/xger/xger_3232.hpp b/src/database/kernels/xger/xger_3232.hpp
index af0431a7..7f547e1e 100644
--- a/src/database/kernels/xger/xger_3232.hpp
+++ b/src/database/kernels/xger/xger_3232.hpp
@@ -51,6 +51,10 @@ const DatabaseEntry XgerComplexSingle = {
{ Name{"ATI Radeon HD 6750M "}, Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 128, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 4, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 64, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -137,6 +141,10 @@ const DatabaseEntry XgerComplexSingle = {
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 8, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 32, 8, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xger/xger_64.hpp b/src/database/kernels/xger/xger_64.hpp
index f531dd5a..857efaa3 100644
--- a/src/database/kernels/xger/xger_64.hpp
+++ b/src/database/kernels/xger/xger_64.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry XgerDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 8, 16, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 32, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 128, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -123,6 +127,10 @@ const DatabaseEntry XgerDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 32, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 512, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 4, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 64, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/database/kernels/xger/xger_6464.hpp b/src/database/kernels/xger/xger_6464.hpp
index b58867f1..bfeb114e 100644
--- a/src/database/kernels/xger/xger_6464.hpp
+++ b/src/database/kernels/xger/xger_6464.hpp
@@ -43,6 +43,10 @@ const DatabaseEntry XgerComplexDouble = {
{ Name{"AMD Radeon R9 380 "}, Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 16, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "Vega", {
+ { Name{"Radeon RX Vega "}, Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 64, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ Name{"AMD Radeon Pro 580 Compute Engine "}, Params{ 64, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 32, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@@ -123,6 +127,10 @@ const DatabaseEntry XgerComplexDouble = {
{ Name{"TITAN X (Pascal) "}, Params{ 4, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
{ kDeviceNameDefault , Params{ 256, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
+ { "SM7.5", {
+ { Name{"TITAN RTX "}, Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ { kDeviceNameDefault , Params{ 32, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
+ } },
{ "default", {
{ kDeviceNameDefault , Params{ 128, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
} },
diff --git a/src/kernels/level1/xamax.opencl b/src/kernels/level1/xamax.opencl
index e65acf17..85cbdc86 100644
--- a/src/kernels/level1/xamax.opencl
+++ b/src/kernels/level1/xamax.opencl
@@ -64,7 +64,7 @@ void Xamax(const int n,
#else
x = fabs(x);
#endif
- if (x >= max) {
+ if (x > max) {
max = x;
imax = id*x_inc + x_offset;
}
@@ -77,7 +77,7 @@ void Xamax(const int n,
// Performs reduction in local memory
for (int s=WGS1/2; s>0; s=s>>1) {
if (lid < s) {
- if (maxlm[lid + s] >= maxlm[lid]) {
+ if (maxlm[lid + s] > maxlm[lid]) {
maxlm[lid] = maxlm[lid + s];
imaxlm[lid] = imaxlm[lid + s];
}
@@ -105,7 +105,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
const int lid = get_local_id(0);
// Performs the first step of the reduction while loading the data
- if (maxgm[lid + WGS2] >= maxgm[lid]) {
+ if (maxgm[lid + WGS2] > maxgm[lid]) {
maxlm[lid] = maxgm[lid + WGS2];
imaxlm[lid] = imaxgm[lid + WGS2];
}
@@ -118,7 +118,7 @@ void XamaxEpilogue(const __global singlereal* restrict maxgm,
// Performs reduction in local memory
for (int s=WGS2/2; s>0; s=s>>1) {
if (lid < s) {
- if (maxlm[lid + s] >= maxlm[lid]) {
+ if (maxlm[lid + s] > maxlm[lid]) {
maxlm[lid] = maxlm[lid + s];
imaxlm[lid] = imaxlm[lid + s];
}
diff --git a/src/pyclblast/samples/saxpybatched.py b/src/pyclblast/samples/saxpybatched.py
new file mode 100644
index 00000000..fa523945
--- /dev/null
+++ b/src/pyclblast/samples/saxpybatched.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0.
+# This file follows the PEP8 Python style guide and uses a max-width of 100 characters per line.
+#
+# Author(s):
+# Cedric Nugteren <www.cedricnugteren.nl>
+
+import numpy as np
+import pyopencl as cl
+from pyopencl.array import Array
+import pyclblast
+
+# Settings for this sample:
+batch_count = 2
+dtype = 'float32'
+alphas = [1.5, 1.0]
+n = 4
+
+print("# Setting up OpenCL")
+ctx = cl.create_some_context()
+queue = cl.CommandQueue(ctx)
+
+print("# Setting up Numpy arrays")
+x = np.random.rand(n * batch_count).astype(dtype=dtype)
+y = np.random.rand(n * batch_count).astype(dtype=dtype)
+
+print("# Batch offsets: next after each other")
+x_offsets = [0, n]
+y_offsets = [0, n]
+
+print("# Setting up OpenCL arrays")
+clx = Array(queue, x.shape, x.dtype)
+cly = Array(queue, y.shape, y.dtype)
+clx.set(x)
+cly.set(y)
+
+print("# Example level-1 batched operation: AXPY-batched")
+assert len(alphas) == len(x_offsets) == len(y_offsets) == batch_count
+pyclblast.axpyBatched(queue, n, clx, cly, alphas, x_offsets, y_offsets)
+queue.finish()
+
+print("# Full result for vector y: %s" % str(cly.get()))
+for i in range(batch_count):
+ result = alphas[i] * x[x_offsets[i]:x_offsets[i] + n] + y[y_offsets[i]:y_offsets[i] + n]
+ print("# Expected result batch #%d: %s" % (i, str(result)))
diff --git a/src/pyclblast/setup.py b/src/pyclblast/setup.py
index 1c1bf3ab..bcc966ed 100644
--- a/src/pyclblast/setup.py
+++ b/src/pyclblast/setup.py
@@ -22,7 +22,7 @@ ext_modules.append(
setup(
name="pyclblast",
- version="1.2.0",
+ version="1.3.0",
author="Cedric Nugteren",
author_email="web@cedricnugteren.nl",
url="https://github.com/CNugteren/CLBlast/blob/master/src/pyclblast",
diff --git a/src/pyclblast/src/pyclblast.pyx b/src/pyclblast/src/pyclblast.pyx
index 14efcf8a..eb46649f 100644
--- a/src/pyclblast/src/pyclblast.pyx
+++ b/src/pyclblast/src/pyclblast.pyx
@@ -364,6 +364,7 @@ def swap(queue, n, x, y, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0):
err = CLBlastHswap(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXswap' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -405,6 +406,7 @@ def scal(queue, n, x, x_inc = 1, alpha = 1.0, x_offset = 0):
err = CLBlastHscal(n, <cl_half>alpha, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXscal' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -448,6 +450,7 @@ def copy(queue, n, x, y, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0):
err = CLBlastHcopy(n, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXcopy' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -491,6 +494,7 @@ def axpy(queue, n, x, y, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset = 0, y_offs
err = CLBlastHaxpy(n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXaxpy' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -530,6 +534,7 @@ def dot(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0, d
err = CLBlastHdot(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXdot' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -566,6 +571,7 @@ def dotu(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0,
err = CLBlastZdotu(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXdotu' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -602,6 +608,7 @@ def dotc(queue, n, x, y, dot, x_inc = 1, y_inc = 1, x_offset = 0, y_offset = 0,
err = CLBlastZdotc(n, dot_buffer, dot_offset, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXdotc' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -645,6 +652,7 @@ def nrm2(queue, n, x, nrm2, x_inc = 1, x_offset = 0, nrm2_offset = 0):
err = CLBlastHnrm2(n, nrm2_buffer, nrm2_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXnrm2' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -688,6 +696,7 @@ def asum(queue, n, x, asum, x_inc = 1, x_offset = 0, asum_offset = 0):
err = CLBlastHasum(n, asum_buffer, asum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXasum' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -731,6 +740,7 @@ def sum(queue, n, x, sum, x_inc = 1, x_offset = 0, sum_offset = 0):
err = CLBlastHsum(n, sum_buffer, sum_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXsum' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -774,6 +784,7 @@ def amax(queue, n, x, imax, x_inc = 1, x_offset = 0, imax_offset = 0):
err = CLBlastiHamax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXamax' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -817,6 +828,7 @@ def amin(queue, n, x, imin, x_inc = 1, x_offset = 0, imin_offset = 0):
err = CLBlastiHamin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXamin' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -860,6 +872,7 @@ def max(queue, n, x, imax, x_inc = 1, x_offset = 0, imax_offset = 0):
err = CLBlastiHmax(n, imax_buffer, imax_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXmax' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -903,6 +916,7 @@ def min(queue, n, x, imin, x_inc = 1, x_offset = 0, imin_offset = 0):
err = CLBlastiHmin(n, imin_buffer, imin_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXmin' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -949,6 +963,7 @@ def gemv(queue, m, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0
err = CLBlastHgemv(CLBlastLayoutRowMajor, a_transpose, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXgemv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -995,6 +1010,7 @@ def gbmv(queue, m, n, kl, ku, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0,
err = CLBlastHgbmv(CLBlastLayoutRowMajor, a_transpose, m, n, kl, ku, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXgbmv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1032,6 +1048,7 @@ def hemv(queue, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0,
err = CLBlastZhemv(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXhemv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1069,6 +1086,7 @@ def hbmv(queue, n, k, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0
err = CLBlastZhbmv(CLBlastLayoutRowMajor, triangle, n, k, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXhbmv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1106,6 +1124,7 @@ def hpmv(queue, n, ap, x, y, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.
err = CLBlastZhpmv(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), ap_buffer, ap_offset, x_buffer, x_offset, x_inc, <cl_double2>cl_double2(x=beta.real,y=beta.imag), y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXhpmv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1146,6 +1165,7 @@ def symv(queue, n, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.0,
err = CLBlastHsymv(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXsymv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1186,6 +1206,7 @@ def sbmv(queue, n, k, a, x, y, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0
err = CLBlastHsbmv(CLBlastLayoutRowMajor, triangle, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXsbmv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1226,6 +1247,7 @@ def spmv(queue, n, ap, x, y, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, beta = 0.
err = CLBlastHspmv(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, <cl_half>beta, y_buffer, y_offset, y_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXspmv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1272,6 +1294,7 @@ def trmv(queue, n, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = Fal
err = CLBlastHtrmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXtrmv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1318,6 +1341,7 @@ def tbmv(queue, n, k, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp =
err = CLBlastHtbmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, k, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXtbmv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1364,6 +1388,7 @@ def tpmv(queue, n, ap, x, ap_ld, x_inc = 1, lower_triangle = False, a_transp = F
err = CLBlastHtpmv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, ap_buffer, ap_offset, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXtpmv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1407,6 +1432,7 @@ def trsv(queue, n, a, x, a_ld, x_inc = 1, lower_triangle = False, a_transp = Fal
err = CLBlastZtrsv(CLBlastLayoutRowMajor, triangle, a_transpose, diagonal, n, a_buffer, a_offset, a_ld, x_buffer, x_offset, x_inc, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXtrsv' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1446,6 +1472,7 @@ def ger(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset
err = CLBlastHger(CLBlastLayoutRowMajor, m, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXger' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1482,6 +1509,7 @@ def geru(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset
err = CLBlastZgeru(CLBlastLayoutRowMajor, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXgeru' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1518,6 +1546,7 @@ def gerc(queue, m, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, x_offset
err = CLBlastZgerc(CLBlastLayoutRowMajor, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXgerc' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1553,6 +1582,7 @@ def her(queue, n, x, a, a_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_
err = CLBlastZher(CLBlastLayoutRowMajor, triangle, n, <cl_double>alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXher' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1588,6 +1618,7 @@ def hpr(queue, n, x, ap, ap_ld, x_inc = 1, alpha = 1.0, lower_triangle = False,
err = CLBlastZhpr(CLBlastLayoutRowMajor, triangle, n, <cl_double>alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXhpr' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1625,6 +1656,7 @@ def her2(queue, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_trian
err = CLBlastZher2(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXher2' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1662,6 +1694,7 @@ def hpr2(queue, n, x, y, ap, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_tri
err = CLBlastZhpr2(CLBlastLayoutRowMajor, triangle, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXhpr2' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1700,6 +1733,7 @@ def syr(queue, n, x, a, a_ld, x_inc = 1, alpha = 1.0, lower_triangle = False, x_
err = CLBlastHsyr(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXsyr' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1738,6 +1772,7 @@ def spr(queue, n, x, ap, ap_ld, x_inc = 1, alpha = 1.0, lower_triangle = False,
err = CLBlastHspr(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, ap_buffer, ap_offset, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXspr' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1778,6 +1813,7 @@ def syr2(queue, n, x, y, a, a_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_trian
err = CLBlastHsyr2(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, a_buffer, a_offset, a_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXsyr2' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1818,6 +1854,7 @@ def spr2(queue, n, x, y, ap, ap_ld, x_inc = 1, y_inc = 1, alpha = 1.0, lower_tri
err = CLBlastHspr2(CLBlastLayoutRowMajor, triangle, n, <cl_half>alpha, x_buffer, x_offset, x_inc, y_buffer, y_offset, y_inc, ap_buffer, ap_offset, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXspr2' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1865,6 +1902,7 @@ def gemm(queue, m, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, a_t
err = CLBlastHgemm(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXgemm' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1912,6 +1950,7 @@ def symm(queue, m, n, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, right_
err = CLBlastHsymm(CLBlastLayoutRowMajor, side, triangle, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXsymm' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1950,6 +1989,7 @@ def hemm(queue, m, n, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, right_
err = CLBlastZhemm(CLBlastLayoutRowMajor, side, triangle, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_double2>cl_double2(x=beta.real,y=beta.imag), c_buffer, c_offset, c_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXhemm' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -1995,6 +2035,7 @@ def syrk(queue, n, k, a, c, a_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle
err = CLBlastHsyrk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXsyrk' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -2031,6 +2072,7 @@ def herk(queue, n, k, a, c, a_ld, c_ld, alpha = 1.0, beta = 0.0, lower_triangle
err = CLBlastZherk(CLBlastLayoutRowMajor, triangle, a_transpose, n, k, <cl_double>alpha, a_buffer, a_offset, a_ld, <cl_double>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXherk' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -2078,6 +2120,7 @@ def syr2k(queue, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, lower
err = CLBlastHsyr2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_half>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXsyr2k' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -2116,6 +2159,7 @@ def her2k(queue, n, k, a, b, c, a_ld, b_ld, c_ld, alpha = 1.0, beta = 0.0, lower
err = CLBlastZher2k(CLBlastLayoutRowMajor, triangle, ab_transpose, n, k, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, <cl_double>beta, c_buffer, c_offset, c_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXher2k' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -2163,6 +2207,7 @@ def trmm(queue, m, n, a, b, a_ld, b_ld, alpha = 1.0, right_side = False, lower_t
err = CLBlastHtrmm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, <cl_half>alpha, a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXtrmm' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
@@ -2207,11 +2252,223 @@ def trsm(queue, m, n, a, b, a_ld, b_ld, alpha = 1.0, right_side = False, lower_t
err = CLBlastZtrsm(CLBlastLayoutRowMajor, side, triangle, a_transpose, diagonal, m, n, <cl_double2>cl_double2(x=alpha.real,y=alpha.imag), a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, &command_queue, &event)
else:
raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
if err != CLBlastSuccess:
raise RuntimeError("PyCLBlast: 'CLBlastXtrsm' failed: %s" % get_status_message(err))
return cl.Event.from_int_ptr(<size_t>event)
####################################################################################################
+# Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED
+####################################################################################################
+
+# C prototypes of the batched AXPY entry points declared in clblast_c.h, one per scalar type
+# (float, double, cl_float2, cl_double2, cl_half). `alphas`, `x_offsets` and `y_offsets` are
+# passed as pointers alongside a `batch_count`.
+cdef extern from "clblast_c.h":
+ CLBlastStatusCode CLBlastSaxpyBatched(const size_t n, const float *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastDaxpyBatched(const size_t n, const double *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastCaxpyBatched(const size_t n, const cl_float2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastZaxpyBatched(const size_t n, const cl_double2 *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastHaxpyBatched(const size_t n, const cl_half *alphas, const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc, cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+
+def axpyBatched(queue, n, x, y, alphas, x_offsets, y_offsets, x_inc = 1, y_inc = 1):
+    """
+    xAXPYBATCHED: Batched version of AXPY
+
+    Copies the per-batch scalars and offsets into temporary host arrays and calls the
+    CLBlast<S|D|C|Z|H>axpyBatched routine matching the common dtype of x and y.
+
+    Parameters:
+        queue: object exposing `int_ptr`, used as the OpenCL command queue handle.
+        n: vector length forwarded to CLBlast.
+        x, y: device arrays exposing `base_data.int_ptr` (presumably pyopencl arrays - confirm).
+        alphas, x_offsets, y_offsets: indexable sequences, one entry per batch; all three
+            must have the same length, which becomes the batch count.
+        x_inc, y_inc: strides forwarded to CLBlast.
+
+    Returns:
+        A cl.Event built from the event handle filled in by CLBlast.
+
+    Raises:
+        RuntimeError: the batch-sized arguments differ in length, or CLBlast reports a
+            status other than CLBlastSuccess.
+        MemoryError: a temporary host array could not be allocated.
+        ValueError: the dtype is not one of the supported types.
+    """
+
+    dtype = check_dtype([x, y], ["float32", "float64", "complex64", "complex128", "float16"])
+    check_vector(x, "x")
+    check_vector(y, "y")
+
+    # `a != b != c` means `a != b and b != c`, which lets a single mismatching argument
+    # through; every length has to agree, so test the chained equality instead.
+    if not (len(x_offsets) == len(y_offsets) == len(alphas)):
+        raise RuntimeError("PyCLBlast: 'CLBlastXaxpyBatched' failed: length of batch-sized arguments x_offsets, y_offsets, alphas should be equal")
+    batch_count = len(x_offsets)
+
+    # Declared up front (cdef is not allowed inside try) and NULL-initialised so the
+    # finally-clause can free them unconditionally.
+    cdef size_t *x_offsets_c = NULL
+    cdef size_t *y_offsets_c = NULL
+    cdef void *alphas_c = NULL
+    cdef cl_mem x_buffer
+    cdef cl_mem y_buffer
+    cdef cl_command_queue command_queue
+    cdef cl_event event = NULL
+    cdef CLBlastStatusCode err
+
+    try:
+        x_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+        y_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+        # Sized for cl_double2, the widest element written below, so the buffer is large
+        # enough whichever dtype branch fills it.
+        alphas_c = <void *> PyMem_Malloc(batch_count * sizeof(cl_double2))
+        if x_offsets_c == NULL or y_offsets_c == NULL or alphas_c == NULL:
+            raise MemoryError("PyCLBlast: 'CLBlastXaxpyBatched' failed: could not allocate host-side batch arguments")
+
+        for i in range(batch_count):
+            x_offsets_c[i] = x_offsets[i]
+        for i in range(batch_count):
+            y_offsets_c[i] = y_offsets[i]
+        for i in range(batch_count):
+            if dtype == np.dtype("float32"):
+                (<cl_float*>alphas_c)[i] = <cl_float>alphas[i]
+            elif dtype == np.dtype("float64"):
+                (<cl_double*>alphas_c)[i] = <cl_double>alphas[i]
+            elif dtype == np.dtype("complex64"):
+                (<cl_float2*>alphas_c)[i] = <cl_float2>cl_float2(x=alphas[i].real,y=alphas[i].imag)
+            elif dtype == np.dtype("complex128"):
+                (<cl_double2*>alphas_c)[i] = <cl_double2>cl_double2(x=alphas[i].real,y=alphas[i].imag)
+            elif dtype == np.dtype("float16"):
+                (<cl_half*>alphas_c)[i] = <cl_half>alphas[i]
+
+        x_buffer = <cl_mem><size_t>x.base_data.int_ptr
+        y_buffer = <cl_mem><size_t>y.base_data.int_ptr
+
+        command_queue = <cl_command_queue><size_t>queue.int_ptr
+
+        if dtype == np.dtype("float32"):
+            err = CLBlastSaxpyBatched(n, <cl_float*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+        elif dtype == np.dtype("float64"):
+            err = CLBlastDaxpyBatched(n, <cl_double*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+        elif dtype == np.dtype("complex64"):
+            err = CLBlastCaxpyBatched(n, <cl_float2*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+        elif dtype == np.dtype("complex128"):
+            err = CLBlastZaxpyBatched(n, <cl_double2*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+        elif dtype == np.dtype("float16"):
+            err = CLBlastHaxpyBatched(n, <cl_half*>alphas_c, x_buffer, x_offsets_c, x_inc, y_buffer, y_offsets_c, y_inc, batch_count, &command_queue, &event)
+        else:
+            raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+    finally:
+        # Runs on success and on every exception above, so the host arrays never leak.
+        PyMem_Free(x_offsets_c)
+        PyMem_Free(y_offsets_c)
+        PyMem_Free(alphas_c)
+
+    if err != CLBlastSuccess:
+        raise RuntimeError("PyCLBlast: 'CLBlastXaxpyBatched' failed: %s" % get_status_message(err))
+    return cl.Event.from_int_ptr(<size_t>event)
+
+####################################################################################################
+# Batched version of GEMM: SGEMMBATCHED/DGEMMBATCHED/CGEMMBATCHED/ZGEMMBATCHED/HGEMMBATCHED
+####################################################################################################
+
+# C prototypes of the batched GEMM entry points declared in clblast_c.h, one per scalar type
+# (float, double, cl_float2, cl_double2, cl_half). `alphas`, `betas` and the a/b/c offsets are
+# passed as pointers alongside a `batch_count`.
+cdef extern from "clblast_c.h":
+ CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const float *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const double *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_float2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_double2 *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half *alphas, const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld, const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld, const cl_half *betas, cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+
+def gemmBatched(queue, m, n, k, a, b, c, alphas, betas, a_ld, b_ld, c_ld, a_offsets, b_offsets, c_offsets, a_transp = False, b_transp = False):
+    """
+    xGEMMBATCHED: Batched version of GEMM
+
+    Copies the per-batch scalars and offsets into temporary host arrays and calls the
+    CLBlast<S|D|C|Z|H>gemmBatched routine matching the common dtype of a, b and c,
+    always with CLBlastLayoutRowMajor.
+
+    Parameters:
+        queue: object exposing `int_ptr`, used as the OpenCL command queue handle.
+        m, n, k: matrix dimensions forwarded to CLBlast.
+        a, b, c: device arrays exposing `base_data.int_ptr` (presumably pyopencl arrays - confirm).
+        alphas, betas, a_offsets, b_offsets, c_offsets: indexable sequences, one entry per
+            batch; all five must have the same length, which becomes the batch count.
+        a_ld, b_ld, c_ld: leading dimensions forwarded to CLBlast.
+        a_transp, b_transp: truthy to pass CLBlastTransposeYes for the respective matrix.
+
+    Returns:
+        A cl.Event built from the event handle filled in by CLBlast.
+
+    Raises:
+        RuntimeError: the batch-sized arguments differ in length, or CLBlast reports a
+            status other than CLBlastSuccess.
+        MemoryError: a temporary host array could not be allocated.
+        ValueError: the dtype is not one of the supported types.
+    """
+
+    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
+    check_matrix(a, "a")
+    check_matrix(b, "b")
+    check_matrix(c, "c")
+
+    # A chain of `!=` is only true when every neighbouring pair differs, which lets most
+    # mismatches through; every length has to agree, so test the chained equality instead.
+    if not (len(a_offsets) == len(b_offsets) == len(c_offsets) == len(alphas) == len(betas)):
+        raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: length of batch-sized arguments a_offsets, b_offsets, c_offsets, alphas, betas should be equal")
+    batch_count = len(a_offsets)
+
+    # Declared up front (cdef is not allowed inside try) and NULL-initialised so the
+    # finally-clause can free them unconditionally.
+    cdef size_t *a_offsets_c = NULL
+    cdef size_t *b_offsets_c = NULL
+    cdef size_t *c_offsets_c = NULL
+    cdef void *alphas_c = NULL
+    cdef void *betas_c = NULL
+    cdef cl_mem a_buffer
+    cdef cl_mem b_buffer
+    cdef cl_mem c_buffer
+    cdef cl_command_queue command_queue
+    cdef cl_event event = NULL
+    cdef CLBlastStatusCode err
+
+    try:
+        a_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+        b_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+        c_offsets_c = <size_t *> PyMem_Malloc(batch_count * sizeof(size_t))
+        # Sized for cl_double2, the widest element written below, so the buffers are large
+        # enough whichever dtype branch fills them.
+        alphas_c = <void *> PyMem_Malloc(batch_count * sizeof(cl_double2))
+        betas_c = <void *> PyMem_Malloc(batch_count * sizeof(cl_double2))
+        if a_offsets_c == NULL or b_offsets_c == NULL or c_offsets_c == NULL or alphas_c == NULL or betas_c == NULL:
+            raise MemoryError("PyCLBlast: 'CLBlastXgemmBatched' failed: could not allocate host-side batch arguments")
+
+        for i in range(batch_count):
+            a_offsets_c[i] = a_offsets[i]
+        for i in range(batch_count):
+            b_offsets_c[i] = b_offsets[i]
+        for i in range(batch_count):
+            c_offsets_c[i] = c_offsets[i]
+        for i in range(batch_count):
+            if dtype == np.dtype("float32"):
+                (<cl_float*>alphas_c)[i] = <cl_float>alphas[i]
+            elif dtype == np.dtype("float64"):
+                (<cl_double*>alphas_c)[i] = <cl_double>alphas[i]
+            elif dtype == np.dtype("complex64"):
+                (<cl_float2*>alphas_c)[i] = <cl_float2>cl_float2(x=alphas[i].real,y=alphas[i].imag)
+            elif dtype == np.dtype("complex128"):
+                (<cl_double2*>alphas_c)[i] = <cl_double2>cl_double2(x=alphas[i].real,y=alphas[i].imag)
+            elif dtype == np.dtype("float16"):
+                (<cl_half*>alphas_c)[i] = <cl_half>alphas[i]
+        for i in range(batch_count):
+            if dtype == np.dtype("float32"):
+                (<cl_float*>betas_c)[i] = <cl_float>betas[i]
+            elif dtype == np.dtype("float64"):
+                (<cl_double*>betas_c)[i] = <cl_double>betas[i]
+            elif dtype == np.dtype("complex64"):
+                (<cl_float2*>betas_c)[i] = <cl_float2>cl_float2(x=betas[i].real,y=betas[i].imag)
+            elif dtype == np.dtype("complex128"):
+                (<cl_double2*>betas_c)[i] = <cl_double2>cl_double2(x=betas[i].real,y=betas[i].imag)
+            elif dtype == np.dtype("float16"):
+                (<cl_half*>betas_c)[i] = <cl_half>betas[i]
+
+        a_buffer = <cl_mem><size_t>a.base_data.int_ptr
+        b_buffer = <cl_mem><size_t>b.base_data.int_ptr
+        c_buffer = <cl_mem><size_t>c.base_data.int_ptr
+
+        command_queue = <cl_command_queue><size_t>queue.int_ptr
+        a_transpose = CLBlastTransposeYes if a_transp else CLBlastTransposeNo
+        b_transpose = CLBlastTransposeYes if b_transp else CLBlastTransposeNo
+
+        if dtype == np.dtype("float32"):
+            err = CLBlastSgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_float*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+        elif dtype == np.dtype("float64"):
+            err = CLBlastDgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_double*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+        elif dtype == np.dtype("complex64"):
+            err = CLBlastCgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_float2*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_float2*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+        elif dtype == np.dtype("complex128"):
+            err = CLBlastZgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_double2*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_double2*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+        elif dtype == np.dtype("float16"):
+            err = CLBlastHgemmBatched(CLBlastLayoutRowMajor, a_transpose, b_transpose, m, n, k, <cl_half*>alphas_c, a_buffer, a_offsets_c, a_ld, b_buffer, b_offsets_c, b_ld, <cl_half*>betas_c, c_buffer, c_offsets_c, c_ld, batch_count, &command_queue, &event)
+        else:
+            raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+    finally:
+        # Runs on success and on every exception above, so the host arrays never leak.
+        PyMem_Free(a_offsets_c)
+        PyMem_Free(b_offsets_c)
+        PyMem_Free(c_offsets_c)
+        PyMem_Free(alphas_c)
+        PyMem_Free(betas_c)
+
+    if err != CLBlastSuccess:
+        raise RuntimeError("PyCLBlast: 'CLBlastXgemmBatched' failed: %s" % get_status_message(err))
+    return cl.Event.from_int_ptr(<size_t>event)
+
+####################################################################################################
+# StridedBatched version of GEMM: SGEMMSTRIDEDBATCHED/DGEMMSTRIDEDBATCHED/CGEMMSTRIDEDBATCHED/ZGEMMSTRIDEDBATCHED/HGEMMSTRIDEDBATCHED
+####################################################################################################
+
+# C prototypes of the strided-batched GEMM entry points declared in clblast_c.h, one per scalar
+# type (float, double, cl_float2, cl_double2, cl_half). Unlike the batched variant, `alpha`,
+# `beta` and the offsets are passed by value, with one stride per matrix.
+cdef extern from "clblast_c.h":
+ CLBlastStatusCode CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const float beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const double beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_float2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_float2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_double2 alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_double2 beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+ CLBlastStatusCode CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose, const size_t m, const size_t n, const size_t k, const cl_half alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride, const cl_half beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride, const size_t batch_count,cl_command_queue* queue, cl_event* event)
+
+def gemmStridedBatched(queue, m, n, k, batch_count, a, b, c, a_ld, b_ld, c_ld, a_stride, b_stride, c_stride, alpha = 1.0, beta = 0.0, a_transp = False, b_transp = False, a_offset = 0, b_offset = 0, c_offset = 0):
+    """
+    xGEMMSTRIDEDBATCHED: StridedBatched version of GEMM
+
+    Dispatches on the common dtype of a, b and c to the matching
+    CLBlast<S|D|C|Z|H>gemmStridedBatched routine (row-major layout) and returns the
+    resulting event wrapped in a cl.Event. Raises ValueError for an unrecognized dtype
+    and RuntimeError when CLBlast reports a status other than CLBlastSuccess.
+    """
+
+    dtype = check_dtype([a, b, c], ["float32", "float64", "complex64", "complex128", "float16"])
+    check_matrix(a, "a")
+    check_matrix(b, "b")
+    check_matrix(c, "c")
+
+    # Raw OpenCL handles behind the three matrices.
+    cdef cl_mem a_mem = <cl_mem><size_t>a.base_data.int_ptr
+    cdef cl_mem b_mem = <cl_mem><size_t>b.base_data.int_ptr
+    cdef cl_mem c_mem = <cl_mem><size_t>c.base_data.int_ptr
+
+    cdef cl_command_queue cl_queue = <cl_command_queue><size_t>queue.int_ptr
+    cdef cl_event done = NULL
+
+    # Translate the boolean flags into CLBlast transpose constants.
+    if a_transp:
+        trans_a = CLBlastTransposeYes
+    else:
+        trans_a = CLBlastTransposeNo
+    if b_transp:
+        trans_b = CLBlastTransposeYes
+    else:
+        trans_b = CLBlastTransposeNo
+
+    cdef CLBlastStatusCode status
+    if dtype == np.dtype("float32"):
+        status = CLBlastSgemmStridedBatched(
+            CLBlastLayoutRowMajor, trans_a, trans_b, m, n, k,
+            <cl_float>alpha,
+            a_mem, a_offset, a_ld, a_stride,
+            b_mem, b_offset, b_ld, b_stride,
+            <cl_float>beta,
+            c_mem, c_offset, c_ld, c_stride,
+            batch_count, &cl_queue, &done)
+    elif dtype == np.dtype("float64"):
+        status = CLBlastDgemmStridedBatched(
+            CLBlastLayoutRowMajor, trans_a, trans_b, m, n, k,
+            <cl_double>alpha,
+            a_mem, a_offset, a_ld, a_stride,
+            b_mem, b_offset, b_ld, b_stride,
+            <cl_double>beta,
+            c_mem, c_offset, c_ld, c_stride,
+            batch_count, &cl_queue, &done)
+    elif dtype == np.dtype("complex64"):
+        status = CLBlastCgemmStridedBatched(
+            CLBlastLayoutRowMajor, trans_a, trans_b, m, n, k,
+            <cl_float2>cl_float2(x=alpha.real,y=alpha.imag),
+            a_mem, a_offset, a_ld, a_stride,
+            b_mem, b_offset, b_ld, b_stride,
+            <cl_float2>cl_float2(x=beta.real,y=beta.imag),
+            c_mem, c_offset, c_ld, c_stride,
+            batch_count, &cl_queue, &done)
+    elif dtype == np.dtype("complex128"):
+        status = CLBlastZgemmStridedBatched(
+            CLBlastLayoutRowMajor, trans_a, trans_b, m, n, k,
+            <cl_double2>cl_double2(x=alpha.real,y=alpha.imag),
+            a_mem, a_offset, a_ld, a_stride,
+            b_mem, b_offset, b_ld, b_stride,
+            <cl_double2>cl_double2(x=beta.real,y=beta.imag),
+            c_mem, c_offset, c_ld, c_stride,
+            batch_count, &cl_queue, &done)
+    elif dtype == np.dtype("float16"):
+        status = CLBlastHgemmStridedBatched(
+            CLBlastLayoutRowMajor, trans_a, trans_b, m, n, k,
+            <cl_half>alpha,
+            a_mem, a_offset, a_ld, a_stride,
+            b_mem, b_offset, b_ld, b_stride,
+            <cl_half>beta,
+            c_mem, c_offset, c_ld, c_stride,
+            batch_count, &cl_queue, &done)
+    else:
+        raise ValueError("PyCLBlast: Unrecognized data-type '%s'" % dtype)
+
+    if status != CLBlastSuccess:
+        raise RuntimeError("PyCLBlast: 'CLBlastXgemmStridedBatched' failed: %s" % get_status_message(status))
+    return cl.Event.from_int_ptr(<size_t>done)
+
+####################################################################################################
# Overrides the parameters
####################################################################################################
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index c5eee527..be6b233e 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -82,7 +82,7 @@ void PrintTimingsToFileAsJSON(const std::string &filename,
void print_separator(const size_t parameters_size) {
printf("x------x-------x");
for (auto i = size_t{0}; i < parameters_size; ++i) { printf("-----"); }
- printf("-x----------------x--------------x--------x-------------------x\n");
+ printf("-x-----------------x-----------------x----------------x--------------x--------x-------------------x\n");
}
// =================================================================================================
@@ -145,6 +145,7 @@ void Tuner(int argc, char* argv[], const int V,
const auto platform = Platform(args.platform_id);
const auto device = Device(platform, args.device_id);
const auto context = Context(device);
+ auto queue = Queue(context, device);
// Tests for validity of the precision and retrieves properties
if (!PrecisionSupported<T>(device)) {
@@ -203,12 +204,11 @@ void Tuner(int argc, char* argv[], const int V,
printf("\n");
printf("| ID | total |");
for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); }
- printf("param | compiles | time | %6s | status |\n", settings.performance_unit.c_str());
+ printf("param | local | global | compiles | time | %6s | status |\n", settings.performance_unit.c_str());
print_separator(settings.parameters.size());
// First runs a reference example to compare against
try {
- auto queue = Queue(context, device);
printf("| ref | - |");
for (auto i = size_t{0}; i < settings.parameters.size() - 1; ++i) { printf(" "); }
printf(" - |");
@@ -219,6 +219,16 @@ void Tuner(int argc, char* argv[], const int V,
device_buffers[id].Write(queue, buffer_sizes[id], source_buffers[id]);
}
+ // Sets the thread configuration
+ auto global = settings.global_size_ref;
+ auto local = settings.local_size_ref;
+
+ // Make sure that the global worksize is a multiple of the local
+ for (auto i=size_t{0}; i<global.size(); ++i) {
+ while ((global[i] / local[i]) * local[i] != global[i]) { global[i]++; }
+ }
+ printf("%8zu%8zu |%8zu%8zu |", local[0], local[1], global[0], global[1]);
+
// Compiles the kernel
auto compiler_options = std::vector<std::string>();
const auto program = CompileFromSource(settings.sources, args.precision, settings.kernel_name,
@@ -229,7 +239,7 @@ void Tuner(int argc, char* argv[], const int V,
// Runs the kernel
const auto time_ms = TimeKernel(args.num_runs, kernel, queue, device,
- settings.global_size_ref, settings.local_size_ref);
+ global, local);
printf(" - |");
if (time_ms == -1.0) { throw std::runtime_error("Error in reference implementation"); }
@@ -251,8 +261,6 @@ void Tuner(int argc, char* argv[], const int V,
auto results = std::vector<TuningResult>();
for (auto config_id = size_t{0}; config_id < configurations.size(); ++config_id) {
try {
- auto queue = Queue(context, device);
-
auto configuration = configurations[config_id];
printf("| %4zu | %5zu |", config_id + 1, configurations.size());
for (const auto& parameter : settings.parameters) {
@@ -266,10 +274,16 @@ void Tuner(int argc, char* argv[], const int V,
}
// Sets the thread configuration
- const auto global = SetThreadConfiguration(configuration, settings.global_size,
- settings.mul_global, settings.div_global);
- const auto local = SetThreadConfiguration(configuration, settings.local_size,
- settings.mul_local, settings.div_local);
+ auto global = SetThreadConfiguration(configuration, settings.global_size,
+ settings.mul_global, settings.div_global);
+ auto local = SetThreadConfiguration(configuration, settings.local_size,
+ settings.mul_local, settings.div_local);
+
+ // Make sure that the global worksize is a multiple of the local
+ for (auto i=size_t{0}; i<global.size(); ++i) {
+ while ((global[i] / local[i]) * local[i] != global[i]) { global[i]++; }
+ }
+ printf("%8zu%8zu |%8zu%8zu |", local[0], local[1], global[0], global[1]);
// Sets the parameters for this configuration
auto kernel_source = std::string{""};