From eaa348735ee5cee396f9ec629f1486ebb3dbeff7 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 16 Jul 2016 15:18:28 +0200 Subject: Created infrastructure to support a direct GEMM kernel; added correct but slow reference kernel as a place-holder --- src/routines/level3/xgemm.cpp | 94 +++++++++++++++++++++++++++++++++++++++++++ src/routines/level3/xgemm.hpp | 23 +++++++++++ 2 files changed, 117 insertions(+) (limited to 'src/routines') diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 0db28537..4bdf3192 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -34,6 +34,7 @@ Xgemm::Xgemm(Queue &queue, EventPointer event, const std::string &name): #include "../../kernels/level3/convert_hermitian.opencl" #include "../../kernels/level3/xgemm_part1.opencl" #include "../../kernels/level3/xgemm_part2.opencl" + #include "../../kernels/level3/xgemm_direct.opencl" ; } @@ -94,6 +95,42 @@ StatusCode Xgemm::DoGemm(const Layout layout, status = TestMatrixC(c_one, c_two, c_buffer, c_offset, c_ld); if (ErrorIn(status)) { return status; } + // Optionally runs the direct version of GEMM. TODO: Set this based on the arguments + const auto do_gemm_direct = true; // for now, for testing + if (do_gemm_direct) { + return GemmDirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate); + } + else { + return GemmIndirect(m, n, k, alpha, + a_buffer, a_offset, a_ld, b_buffer, b_offset, b_ld, beta, + c_buffer, c_offset, c_ld, + a_do_transpose, b_do_transpose, c_do_transpose, a_conjugate, b_conjugate, + a_one, a_two, b_one, b_two, c_one, c_two); + } +} + +// ================================================================================================= + +// The indirect version of GEMM. This uses the faster but non-general kernel. It has specific +// requirements, but several pre and post-processing kernels take care of those. However, the +// overhead of these extra kernels might not be ideal for certain devices/arguments. +template +StatusCode Xgemm::GemmIndirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, + const size_t b_one, const size_t b_two, + const size_t c_one, const size_t c_two) { + auto status = StatusCode::kSuccess; + // Calculates the ceiled versions of m, n, and k const auto m_ceiled = Ceil(m, db_["MWG"]); const auto n_ceiled = Ceil(n, db_["NWG"]); @@ -204,6 +241,63 @@ StatusCode Xgemm::DoGemm(const Layout layout, } catch (...) { return StatusCode::kTempBufferAllocFailure; } } + +// ================================================================================================= + +// The direct version of GEMM, requiring just one kernel, no pre or post-processing kernels. +template +StatusCode Xgemm::GemmDirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate) { + + // Loads the program from the database + const auto program = GetProgramFromCache(context_, PrecisionValue(), routine_name_); + + // Retrieves the XgemmDirect kernel from the compiled binary + try { + auto kernel = Kernel(program, "XgemmDirect"); + + // Sets the kernel arguments + kernel.SetArgument(0, static_cast(m)); + kernel.SetArgument(1, static_cast(n)); + kernel.SetArgument(2, static_cast(k)); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); + kernel.SetArgument(5, a_buffer()); + kernel.SetArgument(6, static_cast(a_offset)); + kernel.SetArgument(7, static_cast(a_ld)); + kernel.SetArgument(8, b_buffer()); + kernel.SetArgument(9, static_cast(b_offset)); + kernel.SetArgument(10, static_cast(b_ld)); + kernel.SetArgument(11, c_buffer()); + kernel.SetArgument(12, static_cast(c_offset)); + kernel.SetArgument(13, static_cast(c_ld)); + kernel.SetArgument(14, static_cast(a_do_transpose)); + kernel.SetArgument(15, static_cast(b_do_transpose)); + kernel.SetArgument(16, static_cast(c_do_transpose)); + kernel.SetArgument(17, static_cast(a_conjugate)); + kernel.SetArgument(18, static_cast(b_conjugate)); + + // Computes the global and local thread sizes + const auto m_ceiled = Ceil(m, 16); + const auto n_ceiled = Ceil(n, 16); + const auto global = std::vector{m_ceiled, n_ceiled}; + const auto local = std::vector{16, 16}; + + // Launches the kernel + auto status = RunKernel(kernel, queue_, device_, global, local, event_); + if (ErrorIn(status)) { return status; } + + // Successfully finished the computation + return StatusCode::kSuccess; + } catch (...) { return StatusCode::kInvalidKernel; } +} + // ================================================================================================= // Compiles the templated class diff --git a/src/routines/level3/xgemm.hpp b/src/routines/level3/xgemm.hpp index bc51c7f5..8db1cb11 100644 --- a/src/routines/level3/xgemm.hpp +++ b/src/routines/level3/xgemm.hpp @@ -35,6 +35,29 @@ class Xgemm: public Routine { const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, const Buffer &c_buffer, const size_t c_offset, const size_t c_ld); + + // Indirect version of GEMM (with pre and post-processing kernels) + StatusCode GemmIndirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate, + const size_t a_one, const size_t a_two, + const size_t b_one, const size_t b_two, + const size_t c_one, const size_t c_two); + + // Direct version of GEMM (no pre and post-processing kernels) + StatusCode GemmDirect(const size_t m, const size_t n, const size_t k, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const T beta, + const Buffer &c_buffer, const size_t c_offset, const size_t c_ld, + const bool a_do_transpose, const bool b_do_transpose, const bool c_do_transpose, + const bool a_conjugate, const bool b_conjugate); }; // ================================================================================================= -- cgit v1.2.3