summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--src/kernels/level3/invert_diagonal_blocks.opencl50
-rw-r--r--src/tuning/kernels/invert.cpp125
3 files changed, 160 insertions, 17 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b7c1238..9f7264f8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -193,7 +193,7 @@ endif()
# Sets the supported routines and the used kernels. New routines and kernels should be added here.
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
- xgemm xgemm_direct xgemv)
+ xgemm xgemm_direct xgemv invert)
set(DATABASES copy pad padtranspose transpose xaxpy xdot
xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger
gemm_routine)
diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl
index db1513c1..e8f0ea91 100644
--- a/src/kernels/level3/invert_diagonal_blocks.opencl
+++ b/src/kernels/level3/invert_diagonal_blocks.opencl
@@ -56,8 +56,26 @@ R"(
// =================================================================================================
#if defined(ROUTINE_INVERT)
-#define LOCALX 17 // 16 + 1 to avoid bank conflicts
-#define LOCALY 16
+// Parameters set by the tuner
+// TODO: Make these actually tunable
+#ifndef INTERNAL_BLOCK_SIZE
+ #define INTERNAL_BLOCK_SIZE 16 // Internal block size of the invert kernel
+#endif
+#ifndef LOCALPAD
+ #define LOCALPAD 0 // Padding in the x-dimension of the local memory to avoid bank conflicts
+#endif
+#ifndef LOCALX
+ #define LOCALX (16 + LOCALPAD) // Local memory size in x-dimension of TripleMatMul kernels
+#endif
+#ifndef LOCALY
+ #define LOCALY 16 // Local memory size in y-dimension of TripleMatMul kernels
+#endif
+#ifndef TMMWGSX
+ #define TMMWGSX 4 // Work-group size in x-dimension of TripleMatMul kernels
+#endif
+#ifndef TMMWGSY
+ #define TMMWGSY 4 // Work-group size in y-dimension of TripleMatMul kernels
+#endif
// =================================================================================================
@@ -172,7 +190,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
const int page = get_group_id(1) % num_pages;
const int lidx = get_local_id(0);
const int lidy = get_local_id(1);
- const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1));
+ const int ibx = get_group_id(0) * (get_local_size(0) * TMMWGSY);
const int iby = by*16;
const int id = lidx + lidy*get_local_size(0);
const int row = page*current_size*2 + current_size + ibx + id;
@@ -195,7 +213,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part,
// Loads a 16 x 16 block of B into local memory using NX x 4 threads
for (int i = 0; i < 16; i += (size/4) ) { // += get_local_size(0)
- for (int _j = 0; _j < 16; _j += 4 ) { // += get_local_size(1)
+ for (int _j = 0; _j < 16; _j += TMMWGSY ) { // += get_local_size(1)
blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb];
}
}
@@ -321,7 +339,7 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r
// =================================================================================================
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -330,7 +348,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -338,7 +356,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -347,7 +365,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -355,7 +373,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s
}
// B21 = A21 * B11
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -364,7 +382,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in
}
// B21 = -B22 * B21
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -374,7 +392,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s
// =================================================================================================
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -383,7 +401,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(4, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -391,7 +409,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -400,7 +418,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(8, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
@@ -408,7 +426,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s
}
// B12 = A12 * B22
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda,
__global real* restrict dest, int current_size, int num_pages, const int block_size)
{
@@ -417,7 +435,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in
}
// B12 = -B11 * B12
-__kernel __attribute__((reqd_work_group_size(16, 4, 1)))
+__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1)))
void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size)
{
__local real lm[LOCALY * LOCALX];
diff --git a/src/tuning/kernels/invert.cpp b/src/tuning/kernels/invert.cpp
new file mode 100644
index 00000000..cce2fc8d
--- /dev/null
+++ b/src/tuning/kernels/invert.cpp
@@ -0,0 +1,125 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file uses the auto-tuner to tune the invert OpenCL kernels.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+
+#include "utilities/utilities.hpp"
+#include "tuning/tuning.hpp"
+
+namespace clblast {
+// =================================================================================================
+
+// Settings for this kernel (default command-line arguments)
+TunerDefaults GetTunerDefaults(const int) {
+ auto settings = TunerDefaults();
+ settings.options = {kArgN, kArgM, kArgK};
+ settings.default_n = 128; // dimension of input matrix
+ settings.default_m = 64; // block size
+ settings.default_k = 16; // current size
+ return settings;
+}
+
+// Settings for this kernel (general)
+template <typename T>
+TunerSettings GetTunerSettings(const int, const Arguments<T> &args) {
+ auto settings = TunerSettings();
+
+ // Identification of the kernel
+ settings.kernel_family = "invert";
+ settings.kernel_name = "TripleMatMul16Part1Lower";
+ settings.sources =
+"#define ROUTINE_INVERT"
+#include "../src/kernels/level3/invert_diagonal_blocks.opencl"
+ ;
+
+ // Buffer sizes
+ settings.size_a = args.n * args.a_ld + args.a_offset;
+ settings.size_b = CeilDiv(args.n, args.m) * args.m * args.m;
+
+ // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5)
+ settings.inputs = {2, 3};
+ settings.outputs = {3};
+
+ // Sets the base thread configuration
+ const auto num_pages = CeilDiv(args.n, args.m*2);
+ settings.global_size = {args.k / 4, num_pages * (args.k / 16) * 4};
+ settings.global_size_ref = settings.global_size;
+ settings.local_size = {1, 1};
+ settings.local_size_ref = {4, 4};
+
+ // Transforms the thread configuration based on the parameters
+ settings.mul_local = {{"TMMWGSX", "TMMWGSY"}};
+ settings.div_global = {{}};
+
+ // Sets the tuning parameters and their possible values
+ // TODO: Make these actually tunable, apart from LOCALPAD
+ settings.parameters = {
+ {"INTERNAL_BLOCK_SIZE", {16}},
+ {"LOCALPAD", {0, 1}},
+ {"TMMWGSX", {4}},
+ {"TMMWGSY", {4}},
+ };
+
+ // Describes how to compute the performance metrics
+ settings.metric_amount = 1 * GetBytes(args.precision);
+ settings.performance_unit = "N/A";
+
+ return settings;
+}
+
+// Tests for valid arguments
+template <typename T>
+void TestValidArguments(const int, const Arguments<T> &args) {
+ if (!(args.k == 16)) {
+ throw std::runtime_error("'TripleMatMul16Part1Lower' requires 'k' to be 16");
+ }
+}
+std::vector<Constraint> SetConstraints(const int) { return {}; }
+
+// Sets the kernel's arguments
+template <typename T>
+void SetArguments(const int, Kernel &kernel, const Arguments<T> &args, std::vector<Buffer<T>>& buffers) {
+ const auto num_pages = CeilDiv(args.n, args.m*2);
+ kernel.SetArgument(0, static_cast<int>(args.n));
+ kernel.SetArgument(1, buffers[0]()); // 0 == A matrix
+ kernel.SetArgument(2, 0); // a_offset
+ kernel.SetArgument(3, static_cast<int>(args.n)); // a_ld
+ kernel.SetArgument(4, buffers[1]()); // 1 == B matrix
+ kernel.SetArgument(5, static_cast<int>(args.k)); // current_size
+ kernel.SetArgument(6, static_cast<int>(num_pages)); // num_pages
+ kernel.SetArgument(7, static_cast<int>(args.m)); // block_size
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Shortcuts to the clblast namespace
+using half = clblast::half;
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+ const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv);
+ switch(clblast::GetPrecision(command_line_args)) {
+ case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<half>, clblast::TestValidArguments<half>, clblast::SetConstraints, clblast::SetArguments<half>); break;
+ case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<float>, clblast::TestValidArguments<float>, clblast::SetConstraints, clblast::SetArguments<float>); break;
+ case clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<double>, clblast::TestValidArguments<double>, clblast::SetConstraints, clblast::SetArguments<double>); break;
+ case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<float2>, clblast::TestValidArguments<float2>, clblast::SetConstraints, clblast::SetArguments<float2>); break;
+ case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<double2>, clblast::TestValidArguments<double2>, clblast::SetConstraints, clblast::SetArguments<double2>); break;
+ }
+ return 0;
+}
+
+// =================================================================================================