From bc5a341dfe591946e925db315fc7d8c0c25c2938 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Sat, 30 May 2015 12:30:43 +0200 Subject: Initial commit of preview version --- src/kernels/common.opencl | 120 +++++++++ src/kernels/copy.opencl | 73 +++++ src/kernels/pad.opencl | 180 +++++++++++++ src/kernels/padtranspose.opencl | 150 +++++++++++ src/kernels/transpose.opencl | 168 ++++++++++++ src/kernels/xaxpy.opencl | 128 +++++++++ src/kernels/xgemm.opencl | 570 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1389 insertions(+) create mode 100644 src/kernels/common.opencl create mode 100644 src/kernels/copy.opencl create mode 100644 src/kernels/pad.opencl create mode 100644 src/kernels/padtranspose.opencl create mode 100644 src/kernels/transpose.opencl create mode 100644 src/kernels/xaxpy.opencl create mode 100644 src/kernels/xgemm.opencl (limited to 'src/kernels') diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl new file mode 100644 index 00000000..154265e4 --- /dev/null +++ b/src/kernels/common.opencl @@ -0,0 +1,120 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the common defines and type-defs for the CLBlast OpenCL kernels. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this file is used outside of the CLBlast library. +#ifndef PRECISION + #define PRECISION 32 // Data-types: single or double precision, complex or regular +#endif + +// ================================================================================================= + +// Enable support for double-precision +#if PRECISION == 64 || PRECISION == 6464 + #if __OPENCL_VERSION__ <= CL_VERSION_1_1 + #pragma OPENCL EXTENSION cl_khr_fp64: enable + #endif +#endif + +// Single-precision +#if PRECISION == 32 + typedef float real; + typedef float2 real2; + typedef float4 real4; + typedef float8 real8; + typedef float16 real16; + #define ZERO 0.0f + +// Double-precision +#elif PRECISION == 64 + typedef double real; + typedef double2 real2; + typedef double4 real4; + typedef double8 real8; + typedef double16 real16; + #define ZERO 0.0 + +// Complex single-precision +#elif PRECISION == 3232 + typedef struct cfloat {float x; float y;} real; + typedef struct cfloat2 {real x; real y;} real2; + typedef struct cfloat4 {real x; real y; real z; real w;} real4; + typedef struct cfloat8 {real s0; real s1; real s2; real s3; + real s4; real s5; real s6; real s7;} real8; + typedef struct cfloat16 {real s0; real s1; real s2; real s3; + real s4; real s5; real s6; real s7; + real s8; real s9; real sA; real sB; + real sC; real sD; real sE; real sF;} real16; + #define ZERO 0.0f + +// Complex Double-precision +#elif PRECISION == 6464 + typedef struct cdouble {double x; double y;} real; + typedef struct cdouble2 {real x; real y;} real2; + typedef struct cdouble4 {real x; real y; real z; real w;} real4; + typedef struct cdouble8 {real s0; real s1; real s2; real s3; + real s4; real s5; real s6; real s7;} real8; + typedef struct cdouble16 {real s0; real s1; real s2; real s3; + real s4; real s5; real s6; real s7; + real s8; real s9; real sA; real sB; + real sC; real sD; real sE; real sF;} real16; + #define ZERO 0.0 +#endif + +// ================================================================================================= + +// Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction +#define USE_CL_MAD 0 + +// Sets a variable to zero +#if PRECISION == 3232 || PRECISION == 6464 + #define SetToZero(a) a.x = ZERO; a.y = ZERO +#else + #define SetToZero(a) a = ZERO +#endif + +// Multiply two complex variables (used in the define below) +#if PRECISION == 3232 || PRECISION == 6464 + #define MulReal(a, b) a.x*b.x - a.y*b.y + #define MulImag(a, b) a.x*b.y + a.y*b.x +#endif + +// The scalar multiply-add function +#if PRECISION == 3232 || PRECISION == 6464 + #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b) +#else + #if USE_CL_MAD == 1 + #define MultiplyAdd(c, a, b) c = mad(a, b, c) + #else + #define MultiplyAdd(c, a, b) c += a * b + #endif +#endif + +// The scalar AXPBY function +#if PRECISION == 3232 || PRECISION == 6464 + #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d) +#else + #define AXPBY(e, a, b, c, d) e = a*b + c*d +#endif + +// ================================================================================================= + +// End of the C++11 raw string literal +)"; + +// ================================================================================================= diff --git a/src/kernels/copy.opencl b/src/kernels/copy.opencl new file mode 100644 index 00000000..f95b476b --- /dev/null +++ b/src/kernels/copy.opencl @@ -0,0 +1,73 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the common kernels shared among different BLAS routines. This file contains +// kernels to copy matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef COPY_DIMX + #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef COPY_DIMY + #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef COPY_WPT + #define COPY_WPT 1 // Work per thread in the first dimension (x) +#endif +#ifndef COPY_VW + #define COPY_VW 1 // Vector width in the second dimension (y) +#endif + +// ================================================================================================= + +// Data-widths +#if COPY_VW == 1 + typedef real realC; +#elif COPY_VW == 2 + typedef real2 realC; +#elif COPY_VW == 4 + typedef real4 realC; +#elif COPY_VW == 8 + typedef real8 realC; +#elif COPY_VW == 16 + typedef real16 realC; +#endif + +// ================================================================================================= + +// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of +// COPY_VW. Also requires both matrices to be of the same dimensions and without offset. +__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +__kernel void CopyMatrix(const int ld, + __global const realC* restrict src, + __global realC* dest) { + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS routines. This file contains +// kernels to copy and pad matrices in various ways, including: +// 1) copying into a larger matrix by adding padding +// 2) copying into a smaller matrix by removing padding +// 3) from upper/lower triangle into a full matrix +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef PAD_DIMX + #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef PAD_DIMY + #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef PAD_WPTX + #define PAD_WPTX 1 // Work per thread in the first dimension (x) +#endif +#ifndef PAD_WPTY + #define PAD_WPTY 1 // Work per thread in the second dimension (y) +#endif + +// ================================================================================================= + +// Copies a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld +// value and offset can be different. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void PadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to transpose matrices in various ways, including: +// 1) transposing into a larger matrix by adding padding +// 2) transposing into a smaller matrix by removing padding +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef PADTRA_TILE + #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef PADTRA_WPT + #define PADTRA_WPT 1 // Amount of work per thread +#endif +#ifndef PADTRA_PAD + #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif + +// ================================================================================================= + +// Same as PadCopyMatrix, but now also does the transpose +__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +__kernel void PadTransposeMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Local memory to store a tile of the matrix (for coalescing) + __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; + + // Loop over the work per thread + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to transpose matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef TRA_DIM + #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef TRA_WPT + #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other +#endif +#ifndef TRA_PAD + #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif + +// ================================================================================================= + +// Data-widths +#if TRA_WPT == 1 + typedef real realT; +#elif TRA_WPT == 2 + typedef real2 realT; +#elif TRA_WPT == 4 + typedef real4 realT; +#elif TRA_WPT == 8 + typedef real8 realT; +#elif TRA_WPT == 16 + typedef real16 realT; +#endif + +// ================================================================================================= + +// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without +// offset. A more general version is available in 'padtranspose.opencl'. +__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +__kernel void TransposeMatrix(const int ld, + __global const realT* restrict src, + __global realT* dest) { + + // Local memory to store a tile of the matrix (for coalescing) + __local real tile[TRA_WPT*TRA_DIM][TRA_WPT*TRA_DIM + TRA_PAD]; + + // Loop over the work per thread + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit +// strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't +// support vector data-types. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef WGS + #define WGS 64 // The local work-group size +#endif +#ifndef WPT + #define WPT 1 // The amount of work-per-thread +#endif +#ifndef VW + #define VW 1 // Vector width of vectors X and Y +#endif + +// ================================================================================================= + +// Data-widths +#if VW == 1 + typedef real realV; +#elif VW == 2 + typedef real2 realV; +#elif VW == 4 + typedef real4 realV; +#elif VW == 8 + typedef real8 realV; +#elif VW == 16 + typedef real16 realV; +#endif + +// ================================================================================================= + +// The vectorized multiply-add function +inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { + #if VW == 1 + MultiplyAdd(cvec, aval, bvec); + #elif VW == 2 + MultiplyAdd(cvec.x, aval, bvec.x); + MultiplyAdd(cvec.y, aval, bvec.y); + #elif VW == 4 + MultiplyAdd(cvec.x, aval, bvec.x); + MultiplyAdd(cvec.y, aval, bvec.y); + MultiplyAdd(cvec.z, aval, bvec.z); + MultiplyAdd(cvec.w, aval, bvec.w); + #elif VW == 8 + MultiplyAdd(cvec.s0, aval, bvec.s0); + MultiplyAdd(cvec.s1, aval, bvec.s1); + MultiplyAdd(cvec.s2, aval, bvec.s2); + MultiplyAdd(cvec.s3, aval, bvec.s3); + MultiplyAdd(cvec.s4, aval, bvec.s4); + MultiplyAdd(cvec.s5, aval, bvec.s5); + MultiplyAdd(cvec.s6, aval, bvec.s6); + MultiplyAdd(cvec.s7, aval, bvec.s7); + #elif VW == 16 + MultiplyAdd(cvec.s0, aval, bvec.s0); + MultiplyAdd(cvec.s1, aval, bvec.s1); + MultiplyAdd(cvec.s2, aval, bvec.s2); + MultiplyAdd(cvec.s3, aval, bvec.s3); + MultiplyAdd(cvec.s4, aval, bvec.s4); + MultiplyAdd(cvec.s5, aval, bvec.s5); + MultiplyAdd(cvec.s6, aval, bvec.s6); + MultiplyAdd(cvec.s7, aval, bvec.s7); + MultiplyAdd(cvec.s8, aval, bvec.s8); + MultiplyAdd(cvec.s9, aval, bvec.s9); + MultiplyAdd(cvec.sA, aval, bvec.sA); + MultiplyAdd(cvec.sB, aval, bvec.sB); + MultiplyAdd(cvec.sC, aval, bvec.sC); + MultiplyAdd(cvec.sD, aval, bvec.sD); + MultiplyAdd(cvec.sE, aval, bvec.sE); + MultiplyAdd(cvec.sF, aval, bvec.sF); + #endif + return cvec; +} + +// ================================================================================================= + +// Full version of the kernel with offsets and strided accesses +__attribute__((reqd_work_group_size(WGS, 1, 1))) +__kernel void Xaxpy(const int n, const real alpha, + const __global real* restrict xgm, const int x_offset, const int x_inc, + __global real* ygm, const int y_offset, const int y_inc) { + + // Loops over the work that needs to be done (allows for an arbitrary number of threads) + #pragma unroll + for (int id = get_global_id(0); id +// +// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto +// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable +// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It +// supports single and double precision (SGEMM/DGEMM) through a pre-processor define. +// +// Matrices are accessed as follows: +// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) +// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n) +// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m) +// +// Or as an image (assuming column-major) +// K +// o-------o +// | | +// N | [B^T] | +// | | +// o-------o +// K N +// o-------o o-----o +// M | [A] | M | [C] | +// | | | | +// o-------o o-----o +// +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef MWG + #define MWG 8 // Tile-size in dimension M (e.g. 64, 128) +#endif +#ifndef NWG + #define NWG 8 // Tile-size in dimension N (e.g. 64, 128) +#endif +#ifndef KWG + #define KWG 8 // Tile-size in dimension K (e.g. 8, 16) +#endif +#ifndef MDIMC + #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) +#endif +#ifndef NDIMC + #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) +#endif +#ifndef MDIMA + #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA +#endif +#ifndef NDIMB + #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB +#endif +#ifndef KWI + #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG) +#endif +#ifndef VWM + #define VWM 1 // Vector width of matrices A and C +#endif +#ifndef VWN + #define VWN 1 // Vector width of matrix B +#endif +#ifndef STRM + #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0) +#endif +#ifndef STRN + #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0) +#endif +#ifndef SA + #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0) +#endif +#ifndef SB + #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0) +#endif + +// Helper parameters based on the above tuning parameters +#define MWI (MWG/MDIMC) // Work per work-item (M-dimension) +#define NWI (NWG/NDIMC) // Work per work-item (N-dimension) +#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA +#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB +#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension) +#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension) +#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension) +#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) + +// Settings +#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually + +// ================================================================================================= + +// Data-widths in dimension M +#if VWM == 1 + typedef real realM; +#elif VWM == 2 + typedef real2 realM; +#elif VWM == 4 + typedef real4 realM; +#elif VWM == 8 + typedef real8 realM; +#elif VWM == 16 + typedef real16 realM; +#endif + +// Data-widths in dimension N +#if VWN == 1 + typedef real realN; +#elif VWN == 2 + typedef real2 realN; +#elif VWN == 4 + typedef real4 realN; +#elif VWN == 8 + typedef real8 realN; +#elif VWN == 16 + typedef real16 realN; +#endif + +// ================================================================================================= + +// Caches global off-chip memory into local (shared) memory on-chip. This function is specific for +// caching the A input matrix. +#if SA == 1 +inline void GlobalToLocalA(const __global realM* restrict agm, __local realM* alm, + const int kSizeM, const int tid, const int kwg) { + const int la0 = tid % MDIMA; + const int la1 = tid / MDIMA; + #pragma unroll + for (int mia=0; mia local (matrix A) + #if SA == 1 + GlobalToLocalA(agm, alm, kSizeM, tid, kwg); + #endif + // Loads data: off-chip --> local (matrix B) + #if SB == 1 + GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); + #endif + + // Synchronizes all threads in a workgroup + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + + // Loops over all workitem tiles, unrolled by a factor KWI + for (int pwi=0; pwi private (matrix A) + #if SA == 1 + LocalToPrivateA(alm, apm, kg); + // Loads data: off-chip --> private (matrix A) + #else + GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); + #endif + + // Loads data: local --> private (matrix B) + #if SB == 1 + LocalToPrivateB(blm, bpm, kg); + // Loads data: off-chip --> private (matrix B) + #else + GlobalToPrivateB(bgm, bpm, kSizeN, idk); + #endif + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulate(cpm, apm, bpm); + } + } + + // Synchronizes all threads in a workgroup + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + } + + // Stores an MWG * NWG tile of results and perform the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeM, alpha, beta); +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)"; + +// ================================================================================================= -- cgit v1.2.3