From eaa348735ee5cee396f9ec629f1486ebb3dbeff7 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 16 Jul 2016 15:18:28 +0200 Subject: Created infrastructure to support a direct GEMM kernel; added correct but slow reference kernel as a place-holder --- src/kernels/level3/xgemm_direct.opencl | 71 ++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 src/kernels/level3/xgemm_direct.opencl (limited to 'src/kernels/level3') diff --git a/src/kernels/level3/xgemm_direct.opencl b/src/kernels/level3/xgemm_direct.opencl new file mode 100644 index 00000000..9d2a55c8 --- /dev/null +++ b/src/kernels/level3/xgemm_direct.opencl @@ -0,0 +1,71 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This is a generic GEMM kernel that works for all sizes and configurations: it doesn't require any +// pre- and post-processing kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Main entry point of the kernel. This is the direct version. 
+__attribute__((reqd_work_group_size(16, 16, 1))) +__kernel void XgemmDirect(const int kSizeM, const int kSizeN, const int kSizeK, + const real_arg arg_alpha, + const real_arg arg_beta, + const __global real* restrict agm, const int a_offset, const int a_ld, + const __global real* restrict bgm, const int b_offset, const int b_ld, + __global real* cgm, const int c_offset, const int c_ld, + const int a_transpose, const int b_transpose, const int c_transpose, + const int a_conjugate, const int b_conjugate) { + const real alpha = GetRealArg(arg_alpha); + const real beta = GetRealArg(arg_beta); + + // Thread identifiers + const int mid = get_global_id(0); // Row ID of cgm + const int nid = get_global_id(1); // Col ID of cgm + + // Allows for incomplete workgroups + if (mid < kSizeM && nid < kSizeN) { + + // Computes a single element + real acc; + SetToZero(acc); + for (int k=0; k