From 07a7012b0dc4824f95020dadd4cb32ffc8a34f7a Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 19 Dec 2017 21:10:48 +0100 Subject: Added skeleton for a tuner for the invert kernel --- CMakeLists.txt | 2 +- src/kernels/level3/invert_diagonal_blocks.opencl | 50 ++++++--- src/tuning/kernels/invert.cpp | 125 +++++++++++++++++++++++ 3 files changed, 160 insertions(+), 17 deletions(-) create mode 100644 src/tuning/kernels/invert.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b7c1238..9f7264f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,7 +193,7 @@ endif() # Sets the supported routines and the used kernels. New routines and kernels should be added here. set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger - xgemm xgemm_direct xgemv) + xgemm xgemm_direct xgemv invert) set(DATABASES copy pad padtranspose transpose xaxpy xdot xgemm xgemm_direct xgemv xgemv_fast xgemv_fast_rot xger gemm_routine) diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl index db1513c1..e8f0ea91 100644 --- a/src/kernels/level3/invert_diagonal_blocks.opencl +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -56,8 +56,26 @@ R"( // ================================================================================================= #if defined(ROUTINE_INVERT) -#define LOCALX 17 // 16 + 1 to avoid bank conflicts -#define LOCALY 16 +// Parameters set by the tuner +// TODO: Make these actually tunable +#ifndef INTERNAL_BLOCK_SIZE + #define INTERNAL_BLOCK_SIZE 16 // Internal block size of the invert kernel +#endif +#ifndef LOCALPAD + #define LOCALPAD 0 // Padding in the x-dimension of the local memory to avoid bank conflicts +#endif +#ifndef LOCALX + #define LOCALX (16 + LOCALPAD) // Local memory size in x-dimension of TripleMatMul kernels +#endif +#ifndef LOCALY + #define LOCALY 16 // Local memory size in y-dimension of TripleMatMul kernels +#endif +#ifndef TMMWGSX + #define TMMWGSX 4 // 
Work-group size in x-dimension of TripleMatMul kernels +#endif +#ifndef TMMWGSY + #define TMMWGSY 4 // Work-group size in y-dimension of TripleMatMul kernels +#endif // ================================================================================================= @@ -172,7 +190,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, const int page = get_group_id(1) % num_pages; const int lidx = get_local_id(0); const int lidy = get_local_id(1); - const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); + const int ibx = get_group_id(0) * (get_local_size(0) * TMMWGSY); const int iby = by*16; const int id = lidx + lidy*get_local_size(0); const int row = page*current_size*2 + current_size + ibx + id; @@ -195,7 +213,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, // Loads a 16 x 16 block of B into local memory using NX x 4 threads for (int i = 0; i < 16; i += (size/4) ) { // += get_local_size(0) - for (int _j = 0; _j < 16; _j += 4 ) { // += get_local_size(1) + for (int _j = 0; _j < 16; _j += TMMWGSY ) { // += get_local_size(1) blm[(lidx + i) * LOCALX + (lidy + _j)] = bgm[k + i + _j*ldb]; } } @@ -321,7 +339,7 @@ INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR r // ================================================================================================= // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -330,7 +348,7 @@ void TripleMatMul16Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void 
TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -338,7 +356,7 @@ void TripleMatMul16Part2Lower(int n, __global real* restrict dest, int current_s } // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -347,7 +365,7 @@ void TripleMatMul32Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -355,7 +373,7 @@ void TripleMatMul32Part2Lower(int n, __global real* restrict dest, int current_s } // B21 = A21 * B11 -__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -364,7 +382,7 @@ void TripleMatMul64Part1Lower(int n, __global const real* restrict src, const in } // B21 = -B22 * B21 -__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -374,7 +392,7 @@ void TripleMatMul64Part2Lower(int n, __global real* restrict dest, int current_s // 
================================================================================================= // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -383,7 +401,7 @@ void TripleMatMul16Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(4, 4, 1))) +__kernel __attribute__((reqd_work_group_size(1 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -391,7 +409,7 @@ void TripleMatMul16Part2Upper(int n, __global real* restrict dest, int current_s } // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -400,7 +418,7 @@ void TripleMatMul32Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(8, 4, 1))) +__kernel __attribute__((reqd_work_group_size(2 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; @@ -408,7 +426,7 @@ void TripleMatMul32Part2Upper(int n, __global real* restrict dest, int current_s } // B12 = A12 * B22 -__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul64Part1Upper(int n, 
__global const real* restrict src, const int a_offset, const int lda, __global real* restrict dest, int current_size, int num_pages, const int block_size) { @@ -417,7 +435,7 @@ void TripleMatMul64Part1Upper(int n, __global const real* restrict src, const in } // B12 = -B11 * B12 -__kernel __attribute__((reqd_work_group_size(16, 4, 1))) +__kernel __attribute__((reqd_work_group_size(4 * TMMWGSX, TMMWGSY, 1))) void TripleMatMul64Part2Upper(int n, __global real* restrict dest, int current_size, int num_pages, const int block_size) { __local real lm[LOCALY * LOCALX]; diff --git a/src/tuning/kernels/invert.cpp b/src/tuning/kernels/invert.cpp new file mode 100644 index 00000000..cce2fc8d --- /dev/null +++ b/src/tuning/kernels/invert.cpp @@ -0,0 +1,125 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file uses the auto-tuner to tune the invert OpenCL kernels. 
+// +// ================================================================================================= + +#include <string> +#include <vector> + +#include "utilities/utilities.hpp" +#include "tuning/tuning.hpp" + +namespace clblast { +// ================================================================================================= + +// Settings for this kernel (default command-line arguments) +TunerDefaults GetTunerDefaults(const int) { + auto settings = TunerDefaults(); + settings.options = {kArgN, kArgM, kArgK}; + settings.default_n = 128; // dimension of input matrix + settings.default_m = 64; // block size + settings.default_k = 16; // current size + return settings; +} + +// Settings for this kernel (general) +template <typename T> +TunerSettings GetTunerSettings(const int, const Arguments<T> &args) { + auto settings = TunerSettings(); + + // Identification of the kernel + settings.kernel_family = "invert"; + settings.kernel_name = "TripleMatMul16Part1Lower"; + settings.sources = +"#define ROUTINE_INVERT" +#include "../src/kernels/level3/invert_diagonal_blocks.opencl" + ; + + // Buffer sizes + settings.size_a = args.n * args.a_ld + args.a_offset; + settings.size_b = CeilDiv(args.n, args.m) * args.m * args.m; + + // Inputs and outputs IDs (X:0, Y:1, A:2, B:3, C:4, temp:5) + settings.inputs = {2, 3}; + settings.outputs = {3}; + + // Sets the base thread configuration + const auto num_pages = CeilDiv(args.n, args.m*2); + settings.global_size = {args.k / 4, num_pages * (args.k / 16) * 4}; + settings.global_size_ref = settings.global_size; + settings.local_size = {1, 1}; + settings.local_size_ref = {4, 4}; + + // Transforms the thread configuration based on the parameters + settings.mul_local = {{"TMMWGSX", "TMMWGSY"}}; + settings.div_global = {{}}; + + // Sets the tuning parameters and their possible values + // TODO: Make these actually tunable, apart from LOCALPAD + settings.parameters = { + {"INTERNAL_BLOCK_SIZE", {16}}, + {"LOCALPAD", {0, 1}}, + {"TMMWGSX", {4}}, + {"TMMWGSY", {4}}, 
}; + + // Describes how to compute the performance metrics + settings.metric_amount = 1 * GetBytes(args.precision); + settings.performance_unit = "N/A"; + + return settings; +} + +// Tests for valid arguments +template <typename T> +void TestValidArguments(const int, const Arguments<T> &args) { + if (!(args.k == 16)) { + throw std::runtime_error("'TripleMatMul16Part1Lower' requires 'k' to be 16"); + } +} +std::vector<Constraint> SetConstraints(const int) { return {}; } + +// Sets the kernel's arguments +template <typename T> +void SetArguments(const int, Kernel &kernel, const Arguments<T> &args, std::vector<Buffer<T>>& buffers) { + const auto num_pages = CeilDiv(args.n, args.m*2); + kernel.SetArgument(0, static_cast<int>(args.n)); + kernel.SetArgument(1, buffers[0]()); // 0 == A matrix + kernel.SetArgument(2, 0); // a_offset + kernel.SetArgument(3, static_cast<int>(args.n)); // a_ld + kernel.SetArgument(4, buffers[1]()); // 1 == B matrix + kernel.SetArgument(5, static_cast<int>(args.k)); // current_size + kernel.SetArgument(6, static_cast<int>(num_pages)); // num_pages + kernel.SetArgument(7, static_cast<int>(args.m)); // block_size +} + +// ================================================================================================= +} // namespace clblast + +// Shortcuts to the clblast namespace +using half = clblast::half; +using float2 = clblast::float2; +using double2 = clblast::double2; + +// Main function (not within the clblast namespace) +int main(int argc, char *argv[]) { + const auto command_line_args = clblast::RetrieveCommandLineArguments(argc, argv); + switch(clblast::GetPrecision(command_line_args)) { + case clblast::Precision::kHalf: clblast::Tuner<half>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<half>, clblast::TestValidArguments<half>, clblast::SetConstraints, clblast::SetArguments<half>); break; + case clblast::Precision::kSingle: clblast::Tuner<float>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<float>, clblast::TestValidArguments<float>, clblast::SetConstraints, clblast::SetArguments<float>); break; + case 
clblast::Precision::kDouble: clblast::Tuner<double>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<double>, clblast::TestValidArguments<double>, clblast::SetConstraints, clblast::SetArguments<double>); break; + case clblast::Precision::kComplexSingle: clblast::Tuner<float2>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<float2>, clblast::TestValidArguments<float2>, clblast::SetConstraints, clblast::SetArguments<float2>); break; + case clblast::Precision::kComplexDouble: clblast::Tuner<double2>(argc, argv, 0, clblast::GetTunerDefaults, clblast::GetTunerSettings<double2>, clblast::TestValidArguments<double2>, clblast::SetConstraints, clblast::SetArguments<double2>); break; + } + return 0; +} + +// ================================================================================================= -- cgit v1.2.3