From 54a8723f8cd4f34a08d651216d680578ffc47fa5 Mon Sep 17 00:00:00 2001 From: CNugteren Date: Mon, 12 Oct 2015 08:28:40 +0200 Subject: Moved level3 kernel files to a subfolder --- src/kernels/copy.opencl | 73 ---- src/kernels/level3/copy.opencl | 73 ++++ src/kernels/level3/pad.opencl | 349 +++++++++++++++++ src/kernels/level3/padtranspose.opencl | 164 ++++++++ src/kernels/level3/transpose.opencl | 149 +++++++ src/kernels/level3/xgemm.opencl | 683 +++++++++++++++++++++++++++++++++ src/kernels/pad.opencl | 349 ----------------- src/kernels/padtranspose.opencl | 164 -------- src/kernels/transpose.opencl | 149 ------- src/kernels/xgemm.opencl | 683 --------------------------------- src/routines/level3/xgemm.cc | 10 +- src/routines/level3/xher2k.cc | 10 +- src/routines/level3/xherk.cc | 10 +- src/routines/level3/xsyr2k.cc | 10 +- src/routines/level3/xsyrk.cc | 10 +- src/tuning/copy.cc | 2 +- src/tuning/pad.cc | 2 +- src/tuning/padtranspose.cc | 2 +- src/tuning/transpose.cc | 2 +- src/tuning/xgemm.cc | 2 +- 20 files changed, 1448 insertions(+), 1448 deletions(-) delete mode 100644 src/kernels/copy.opencl create mode 100644 src/kernels/level3/copy.opencl create mode 100644 src/kernels/level3/pad.opencl create mode 100644 src/kernels/level3/padtranspose.opencl create mode 100644 src/kernels/level3/transpose.opencl create mode 100644 src/kernels/level3/xgemm.opencl delete mode 100644 src/kernels/pad.opencl delete mode 100644 src/kernels/padtranspose.opencl delete mode 100644 src/kernels/transpose.opencl delete mode 100644 src/kernels/xgemm.opencl (limited to 'src') diff --git a/src/kernels/copy.opencl b/src/kernels/copy.opencl deleted file mode 100644 index 7dde688b..00000000 --- a/src/kernels/copy.opencl +++ /dev/null @@ -1,73 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the common kernels shared among different BLAS routines. This file contains -// kernels to copy matrices. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef COPY_DIMX - #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) -#endif -#ifndef COPY_DIMY - #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) -#endif -#ifndef COPY_WPT - #define COPY_WPT 1 // Work per thread in the first dimension (x) -#endif -#ifndef COPY_VW - #define COPY_VW 1 // Vector width in the second dimension (y) -#endif - -// ================================================================================================= - -// Data-widths -#if COPY_VW == 1 - typedef real realC; -#elif COPY_VW == 2 - typedef real2 realC; -#elif COPY_VW == 4 - typedef real4 realC; -#elif COPY_VW == 8 - typedef real8 realC; -#elif COPY_VW == 16 - typedef real16 realC; -#endif - -// ================================================================================================= - -// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of -// COPY_VW. Also requires both matrices to be of the same dimensions and without offset. -__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) -__kernel void CopyMatrix(const int ld, - __global const realC* restrict src, - __global realC* dest) { - #pragma unroll - for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS routines. This file contains +// kernels to copy matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef COPY_DIMX + #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef COPY_DIMY + #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef COPY_WPT + #define COPY_WPT 1 // Work per thread in the first dimension (x) +#endif +#ifndef COPY_VW + #define COPY_VW 1 // Vector width in the second dimension (y) +#endif + +// ================================================================================================= + +// Data-widths +#if COPY_VW == 1 + typedef real realC; +#elif COPY_VW == 2 + typedef real2 realC; +#elif COPY_VW == 4 + typedef real4 realC; +#elif COPY_VW == 8 + typedef real8 realC; +#elif COPY_VW == 16 + typedef real16 realC; +#endif + +// ================================================================================================= + +// Fast copy kernel. Requires 'ld' and the number of threads in dimension 0 to be a multiple of +// COPY_VW. Also requires both matrices to be of the same dimensions and without offset. +__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1))) +__kernel void CopyMatrix(const int ld, + __global const realC* restrict src, + __global realC* dest) { + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS routines. This file contains +// kernels to copy and pad matrices in various ways, including: +// 1) copying into a larger matrix by adding padding +// 2) copying into a smaller matrix by removing padding +// 3) from upper/lower triangle into a full matrix +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef PAD_DIMX + #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) +#endif +#ifndef PAD_DIMY + #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) +#endif +#ifndef PAD_WPTX + #define PAD_WPTX 1 // Work per thread in the first dimension (x) +#endif +#ifndef PAD_WPTY + #define PAD_WPTY 1 // Work per thread in the second dimension (y) +#endif + +// ================================================================================================= + +// Copies a matrix from source to destination. The output is padded with zero values in case the +// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld +// value and offset can be different. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void PadMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int do_conjugate) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one= id_one); } + else if (lower == 1) { condition = (id_two <= id_one); } + if (condition) { + + // Copies the value into the destination matrix. This is always within bounds of the source + // matrix, as we know that the destination matrix is smaller than the source. + if (id_two < dest_two && id_one < dest_one) { + real value = src[id_two*src_ld + id_one + src_offset]; + if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } + dest[id_two*dest_ld + id_one + dest_offset] = value; + } + } + } + } +} + +// ================================================================================================= + +// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is +// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. +__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) +__kernel void SymmLowerToSquared(const int src_dim, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_dim, + const int dest_ld, const int dest_offset, + __global real* dest) { + + // Loops over the work per thread in both dimensions + #pragma unroll + for (int w_one=0; w_one +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to transpose matrices in various ways, including: +// 1) transposing into a larger matrix by adding padding +// 2) transposing into a smaller matrix by removing padding +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef PADTRA_TILE + #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef PADTRA_WPT + #define PADTRA_WPT 1 // Amount of work per thread +#endif +#ifndef PADTRA_PAD + #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif + +// ================================================================================================= + +// Same as PadCopyMatrix, but now also does the transpose +__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) +__kernel void PadTransposeMatrix(const int src_one, const int src_two, + const int src_ld, const int src_offset, + __global const real* restrict src, + const int dest_one, const int dest_two, + const int dest_ld, const int dest_offset, + __global real* dest, + const int do_conjugate) { + + // Local memory to store a tile of the matrix (for coalescing) + __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; + + // Loop over the work per thread + #pragma unroll + for (int w_one=0; w_one= id_dest_two); } + else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } + if (condition) { + + // Stores the transposed value in the destination matrix + if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { + real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; + if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } + dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; + } + } + } + } +} + +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/level3/transpose.opencl b/src/kernels/level3/transpose.opencl new file mode 100644 index 00000000..d726f7ec --- /dev/null +++ b/src/kernels/level3/transpose.opencl @@ -0,0 +1,149 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren +// +// This file contains the common kernels shared among different BLAS functions. This file contains +// kernels to transpose matrices. +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef TRA_DIM + #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) +#endif +#ifndef TRA_WPT + #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other +#endif +#ifndef TRA_PAD + #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts +#endif +#ifndef TRA_SHUFFLE + #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts +#endif + +// ================================================================================================= + +// Data-widths +#if TRA_WPT == 1 + typedef real realT; +#elif TRA_WPT == 2 + typedef real2 realT; +#elif TRA_WPT == 4 + typedef real4 realT; +#elif TRA_WPT == 8 + typedef real8 realT; +#elif TRA_WPT == 16 + typedef real16 realT; +#endif + +// ================================================================================================= + +// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without +// offset. A more general version is available in 'padtranspose.opencl'. +__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) +__kernel void TransposeMatrix(const int ld, + __global const realT* restrict src, + __global realT* dest) { + + // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different + // way over workgroups, breaking memory-bank dependencies. + const int gid0 = get_group_id(0); + #if TRA_SHUFFLE == 1 + const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); + #else + const int gid1 = get_group_id(1); + #endif + + // Local memory to store a tile of the matrix (for coalescing) + __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD]; + + // Loops over the work per thread + #pragma unroll + for (int w_one=0; w_one +// +// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto +// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable +// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It +// supports single and double precision (SGEMM/DGEMM) through a pre-processor define. +// +// Matrices are accessed as follows: +// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) +// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n) +// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m) +// +// Or as an image (assuming column-major) +// K +// o-------o +// | | +// N | [B^T] | +// | | +// o-------o +// K N +// o-------o o-----o +// M | [A] | M | [C] | +// | | | | +// o-------o o-----o +// +// +// ================================================================================================= + +// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string +// literal). Comment-out this line for syntax-highlighting when developing. +R"( + +// ================================================================================================= + +// Parameters set by the tuner or by the database. Here they are given a basic default value in case +// this kernel file is used outside of the CLBlast library. +#ifndef MWG + #define MWG 8 // Tile-size in dimension M (e.g. 64, 128) +#endif +#ifndef NWG + #define NWG 8 // Tile-size in dimension N (e.g. 64, 128) +#endif +#ifndef KWG + #define KWG 8 // Tile-size in dimension K (e.g. 8, 16) +#endif +#ifndef MDIMC + #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) +#endif +#ifndef NDIMC + #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) +#endif +#ifndef MDIMA + #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA +#endif +#ifndef NDIMB + #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB +#endif +#ifndef KWI + #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG) +#endif +#ifndef VWM + #define VWM 1 // Vector width of matrices A and C +#endif +#ifndef VWN + #define VWN 1 // Vector width of matrix B +#endif +#ifndef STRM + #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0) +#endif +#ifndef STRN + #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0) +#endif +#ifndef SA + #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0) +#endif +#ifndef SB + #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0) +#endif + +// Helper parameters based on the above tuning parameters +#define MWI (MWG/MDIMC) // Work per work-item (M-dimension) +#define NWI (NWG/NDIMC) // Work per work-item (N-dimension) +#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA +#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB +#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension) +#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension) +#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension) +#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) + +// Settings +#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually + +// ================================================================================================= + +// Data-widths in dimension M +#if VWM == 1 + typedef real realM; +#elif VWM == 2 + typedef real2 realM; +#elif VWM == 4 + typedef real4 realM; +#elif VWM == 8 + typedef real8 realM; +#elif VWM == 16 + typedef real16 realM; +#endif + +// Data-widths in dimension N +#if VWN == 1 + typedef real realN; +#elif VWN == 2 + typedef real2 realN; +#elif VWN == 4 + typedef real4 realN; +#elif VWN == 8 + typedef real8 realN; +#elif VWN == 16 + typedef real16 realN; +#endif + +// ================================================================================================= + +// Initializes the accumulation registers to zero +inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { + #pragma unroll + for (int mi=0; mi local (matrix A) + #if SA == 1 + GlobalToLocalA(agm, alm, kSizeM, tid, kwg); + #endif + // Loads data: off-chip --> local (matrix B) + #if SB == 1 + GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); + #endif + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + + // Loops over all workitem tiles, unrolled by a factor KWI + for (int pwi=0; pwi private (matrix A) + #if SA == 1 + LocalToPrivateA(alm, apm, kg); + // Loads data: off-chip --> private (matrix A) + #else + GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); + #endif + + // Loads data: local --> private (matrix B) + #if SB == 1 + LocalToPrivateB(blm, bpm, kg); + // Loads data: off-chip --> private (matrix B) + #else + GlobalToPrivateB(bgm, bpm, kSizeN, idk); + #endif + + // Performs the accumulation (Cpm += Apm * Bpm) + MultiplyAccumulate(cpm, apm, bpm); + } + } + #if SA == 1 || SB == 1 + barrier(CLK_LOCAL_MEM_FENCE); + #endif + } +} + +// ================================================================================================= +// The upper-triangular and lower-triangular kernels are only used in special cases +#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) + +// Main entry point of the kernel. This is the upper-triangular version. +__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +__kernel void XgemmUpper(const int kSizeN, const int kSizeK, + const real alpha, const real beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + + // Skip these threads if they do not contain threads contributing to the upper-triangle + if (get_group_id(1)*NWG < get_group_id(0)*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// Main entry point of the kernel. This is the lower-triangular version. +__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +__kernel void XgemmLower(const int kSizeN, const int kSizeK, + const real alpha, const real beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + + // Skip these threads if they do not contain threads contributing to the lower-triangle + if (get_group_id(1)*NWG > get_group_id(0)*MWG) { + return; + } + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeN, alpha, beta); +} + +// ================================================================================================= +// If not using a triangular version, include the regular kernel +#else + +// Main entry point of the kernel. This is the regular full version. +__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) +__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, + const real alpha, const real beta, + const __global realM* restrict agm, + const __global realN* restrict bgm, + __global realM* cgm) { + + // Allocates workgroup-private memory (local memory) + #if SA == 1 + __local realM alm[KWG * MWG/VWM]; + #endif + #if SB == 1 + __local realN blm[KWG * NWG/VWN]; + #endif + + // Computes the matrix-multiplication and stores the result in register memory + realM cpm[NWI][MWI/VWM]; + #if SA == 1 && SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); + #elif SA == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); + #elif SB == 1 + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); + #else + XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); + #endif + + // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta + StoreResults(cgm, cpm, kSizeM, alpha, beta); +} + +#endif +// ================================================================================================= + +// End of the C++11 raw string literal +)" + +// ================================================================================================= diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl deleted file mode 100644 index 69324f20..00000000 --- a/src/kernels/pad.opencl +++ /dev/null @@ -1,349 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the common kernels shared among different BLAS routines. This file contains -// kernels to copy and pad matrices in various ways, including: -// 1) copying into a larger matrix by adding padding -// 2) copying into a smaller matrix by removing padding -// 3) from upper/lower triangle into a full matrix -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef PAD_DIMX - #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) -#endif -#ifndef PAD_DIMY - #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) -#endif -#ifndef PAD_WPTX - #define PAD_WPTX 1 // Work per thread in the first dimension (x) -#endif -#ifndef PAD_WPTY - #define PAD_WPTY 1 // Work per thread in the second dimension (y) -#endif - -// ================================================================================================= - -// Copies a matrix from source to destination. The output is padded with zero values in case the -// destination matrix dimensions are larger than the source matrix dimensions. Additionally, the ld -// value and offset can be different. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void PadMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int do_conjugate) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one= id_one); } - else if (lower == 1) { condition = (id_two <= id_one); } - if (condition) { - - // Copies the value into the destination matrix. This is always within bounds of the source - // matrix, as we know that the destination matrix is smaller than the source. - if (id_two < dest_two && id_one < dest_one) { - real value = src[id_two*src_ld + id_one + src_offset]; - if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); } - dest[id_two*dest_ld + id_one + dest_offset] = value; - } - } - } - } -} - -// ================================================================================================= - -// Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is -// stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. -__attribute__((reqd_work_group_size(PAD_DIMX, PAD_DIMY, 1))) -__kernel void SymmLowerToSquared(const int src_dim, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_dim, - const int dest_ld, const int dest_offset, - __global real* dest) { - - // Loops over the work per thread in both dimensions - #pragma unroll - for (int w_one=0; w_one -// -// This file contains the common kernels shared among different BLAS functions. This file contains -// kernels to transpose matrices in various ways, including: -// 1) transposing into a larger matrix by adding padding -// 2) transposing into a smaller matrix by removing padding -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef PADTRA_TILE - #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) -#endif -#ifndef PADTRA_WPT - #define PADTRA_WPT 1 // Amount of work per thread -#endif -#ifndef PADTRA_PAD - #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts -#endif - -// ================================================================================================= - -// Same as PadCopyMatrix, but now also does the transpose -__attribute__((reqd_work_group_size(PADTRA_TILE, PADTRA_TILE, 1))) -__kernel void PadTransposeMatrix(const int src_one, const int src_two, - const int src_ld, const int src_offset, - __global const real* restrict src, - const int dest_one, const int dest_two, - const int dest_ld, const int dest_offset, - __global real* dest, - const int do_conjugate) { - - // Local memory to store a tile of the matrix (for coalescing) - __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD]; - - // Loop over the work per thread - #pragma unroll - for (int w_one=0; w_one= id_dest_two); } - else if (lower == 1) { condition = (id_dest_one <= id_dest_two); } - if (condition) { - - // Stores the transposed value in the destination matrix - if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) { - real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one]; - if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); } - dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value; - } - } - } - } -} - -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/kernels/transpose.opencl b/src/kernels/transpose.opencl deleted file mode 100644 index d726f7ec..00000000 --- a/src/kernels/transpose.opencl +++ /dev/null @@ -1,149 +0,0 @@ - -// ================================================================================================= -// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This -// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- -// width of 100 characters per line. -// -// Author(s): -// Cedric Nugteren -// -// This file contains the common kernels shared among different BLAS functions. This file contains -// kernels to transpose matrices. -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef TRA_DIM - #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) -#endif -#ifndef TRA_WPT - #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other -#endif -#ifndef TRA_PAD - #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts -#endif -#ifndef TRA_SHUFFLE - #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts -#endif - -// ================================================================================================= - -// Data-widths -#if TRA_WPT == 1 - typedef real realT; -#elif TRA_WPT == 2 - typedef real2 realT; -#elif TRA_WPT == 4 - typedef real4 realT; -#elif TRA_WPT == 8 - typedef real8 realT; -#elif TRA_WPT == 16 - typedef real16 realT; -#endif - -// ================================================================================================= - -// Transposes and copies a matrix. Requires both matrices to be of the same dimensions and without -// offset. A more general version is available in 'padtranspose.opencl'. -__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1))) -__kernel void TransposeMatrix(const int ld, - __global const realT* restrict src, - __global realT* dest) { - - // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different - // way over workgroups, breaking memory-bank dependencies. - const int gid0 = get_group_id(0); - #if TRA_SHUFFLE == 1 - const int gid1 = (get_group_id(0) + get_group_id(1)) % get_num_groups(0); - #else - const int gid1 = get_group_id(1); - #endif - - // Local memory to store a tile of the matrix (for coalescing) - __local realT tile[TRA_WPT*TRA_DIM][TRA_DIM + TRA_PAD]; - - // Loops over the work per thread - #pragma unroll - for (int w_one=0; w_one -// -// This file contains an optimized matrix-multiplication kernel according to the paper by Matsumoto -// et al. and the tutorial on http://www.cedricnugteren.nl/tutorial.php. It is fully configurable -// (and tunable!) using more or less the same parameters/naming conventions as in the paper. It -// supports single and double precision (SGEMM/DGEMM) through a pre-processor define. -// -// Matrices are accessed as follows: -// A: [k*M + m], with 'k' ranging from 0:K and 'm' from 0:M (m,k,m) -// B: [k*N + n], with 'k' ranging from 0:K and 'n' from 0:N (n,k,n) -// C: [n*M + m], with 'n' ranging from 0:N and 'm' from 0:M (m,n,m) -// -// Or as an image (assuming column-major) -// K -// o-------o -// | | -// N | [B^T] | -// | | -// o-------o -// K N -// o-------o o-----o -// M | [A] | M | [C] | -// | | | | -// o-------o o-----o -// -// -// ================================================================================================= - -// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string -// literal). Comment-out this line for syntax-highlighting when developing. -R"( - -// ================================================================================================= - -// Parameters set by the tuner or by the database. Here they are given a basic default value in case -// this kernel file is used outside of the CLBlast library. -#ifndef MWG - #define MWG 8 // Tile-size in dimension M (e.g. 64, 128) -#endif -#ifndef NWG - #define NWG 8 // Tile-size in dimension N (e.g. 64, 128) -#endif -#ifndef KWG - #define KWG 8 // Tile-size in dimension K (e.g. 8, 16) -#endif -#ifndef MDIMC - #define MDIMC 8 // Threads per workgroup in M-dimension (e.g. 8, 16, 32) -#endif -#ifndef NDIMC - #define NDIMC 8 // Threads per workgroup in N-dimension (e.g. 8, 16, 32) -#endif -#ifndef MDIMA - #define MDIMA 8 // Re-shaped tile dimension of matrix A: KDIMA * MDIMA -#endif -#ifndef NDIMB - #define NDIMB 8 // Re-shaped tile dimension of matrix B: KDIMB * NDIMB -#endif -#ifndef KWI - #define KWI 1 // Unroll factor of the KWG loop (smaller or equal than KWG) -#endif -#ifndef VWM - #define VWM 1 // Vector width of matrices A and C -#endif -#ifndef VWN - #define VWN 1 // Vector width of matrix B -#endif -#ifndef STRM - #define STRM 0 // Use strided access within a thread in the M-dimension (1) or not (0) -#endif -#ifndef STRN - #define STRN 0 // Use strided access within a thread in the N-dimension (1) or not (0) -#endif -#ifndef SA - #define SA 0 // Use local/shared memory to cache matrix A (1) or not (0) -#endif -#ifndef SB - #define SB 0 // Use local/shared memory to cache matrix B (1) or not (0) -#endif - -// Helper parameters based on the above tuning parameters -#define MWI (MWG/MDIMC) // Work per work-item (M-dimension) -#define NWI (NWG/NDIMC) // Work per work-item (N-dimension) -#define KDIMA ((MDIMC*NDIMC)/(MDIMA)) // Re-shaped tile dimension of matrix A: KDIMA * MDIMA -#define KDIMB ((MDIMC*NDIMC)/(NDIMB)) // Re-shaped tile dimension of matrix B: KDIMB * NDIMB -#define MWA (MWG/MDIMA) // Amount of loads-per-thread for matrix A (M-dimension) -#define KWA (KWG/KDIMA) // Amount of loads-per-thread for matrix A (K-dimension) -#define KWB (KWG/KDIMB) // Amount of loads-per-thread for matrix B (K-dimension) -#define NWB (NWG/NDIMB) // Amount of loads-per-thread for matrix B (N-dimension) - -// Settings -#define USE_VECTOR_MAD 0 // Unroll (0) or don't (1) unroll the vector MAD manually - -// ================================================================================================= - -// Data-widths in dimension M -#if VWM == 1 - typedef real realM; -#elif VWM == 2 - typedef real2 realM; -#elif VWM == 4 - typedef real4 realM; -#elif VWM == 8 - typedef real8 realM; -#elif VWM == 16 - typedef real16 realM; -#endif - -// Data-widths in dimension N -#if VWN == 1 - typedef real realN; -#elif VWN == 2 - typedef real2 realN; -#elif VWN == 4 - typedef real4 realN; -#elif VWN == 8 - typedef real8 realN; -#elif VWN == 16 - typedef real16 realN; -#endif - -// ================================================================================================= - -// Initializes the accumulation registers to zero -inline void InitAccRegisters(realM cpm[NWI][MWI/VWM]) { - #pragma unroll - for (int mi=0; mi local (matrix A) - #if SA == 1 - GlobalToLocalA(agm, alm, kSizeM, tid, kwg); - #endif - // Loads data: off-chip --> local (matrix B) - #if SB == 1 - GlobalToLocalB(bgm, blm, kSizeN, tid, kwg); - #endif - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - - // Loops over all workitem tiles, unrolled by a factor KWI - for (int pwi=0; pwi private (matrix A) - #if SA == 1 - LocalToPrivateA(alm, apm, kg); - // Loads data: off-chip --> private (matrix A) - #else - GlobalToPrivateA(agm, apm, kSizeM, idk, kwg); - #endif - - // Loads data: local --> private (matrix B) - #if SB == 1 - LocalToPrivateB(blm, bpm, kg); - // Loads data: off-chip --> private (matrix B) - #else - GlobalToPrivateB(bgm, bpm, kSizeN, idk); - #endif - - // Performs the accumulation (Cpm += Apm * Bpm) - MultiplyAccumulate(cpm, apm, bpm); - } - } - #if SA == 1 || SB == 1 - barrier(CLK_LOCAL_MEM_FENCE); - #endif - } -} - -// ================================================================================================= -// The upper-triangular and lower-triangular kernels are only used in special cases -#if defined(ROUTINE_SYRK) || defined(ROUTINE_HERK) || defined(ROUTINE_SYR2K) || defined(ROUTINE_HER2K) - -// Main entry point of the kernel. This is the upper-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmUpper(const int kSizeN, const int kSizeK, - const real alpha, const real beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - - // Skip these threads if they do not contain threads contributing to the upper-triangle - if (get_group_id(1)*NWG < get_group_id(0)*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// Main entry point of the kernel. This is the lower-triangular version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void XgemmLower(const int kSizeN, const int kSizeK, - const real alpha, const real beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - - // Skip these threads if they do not contain threads contributing to the lower-triangle - if (get_group_id(1)*NWG > get_group_id(0)*MWG) { - return; - } - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeN, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeN, alpha, beta); -} - -// ================================================================================================= -// If not using a triangular version, include the regular kernel -#else - -// Main entry point of the kernel. This is the regular full version. -__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1))) -__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, - const real alpha, const real beta, - const __global realM* restrict agm, - const __global realN* restrict bgm, - __global realM* cgm) { - - // Allocates workgroup-private memory (local memory) - #if SA == 1 - __local realM alm[KWG * MWG/VWM]; - #endif - #if SB == 1 - __local realN blm[KWG * NWG/VWN]; - #endif - - // Computes the matrix-multiplication and stores the result in register memory - realM cpm[NWI][MWI/VWM]; - #if SA == 1 && SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm, blm); - #elif SA == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, alm); - #elif SB == 1 - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm, blm); - #else - XgemmBody(kSizeM, kSizeN, kSizeK, agm, bgm, cgm, cpm); - #endif - - // Stores an MWG * NWG tile of results and performs the multiplication with alpha and beta - StoreResults(cgm, cpm, kSizeM, alpha, beta); -} - -#endif -// ================================================================================================= - -// End of the C++11 raw string literal -)" - -// ================================================================================================= diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 525a82e6..372a407b 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -32,11 +32,11 @@ template Xgemm::Xgemm(Queue &queue, Event &event): Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 29b2f733..11537d20 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -30,11 +30,11 @@ template Xher2k::Xher2k(Queue &queue, Event &event): Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 5174e9ab..3c183083 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -30,11 +30,11 @@ template Xherk::Xherk(Queue &queue, Event &event): Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index b36e7c5e..5b003555 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -32,11 +32,11 @@ template Xsyr2k::Xsyr2k(Queue &queue, Event &event): Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index e4668216..6ae824ba 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -32,11 +32,11 @@ template Xsyrk::Xsyrk(Queue &queue, Event &event): Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc index 23828b25..e2837e60 100644 --- a/src/tuning/copy.cc +++ b/src/tuning/copy.cc @@ -31,7 +31,7 @@ class TuneCopy { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/copy.opencl" + #include "../src/kernels/level3/copy.opencl" ; } diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc index 6a826b6b..72729422 100644 --- a/src/tuning/pad.cc +++ b/src/tuning/pad.cc @@ -31,7 +31,7 @@ class TunePad { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/pad.opencl" + #include "../src/kernels/level3/pad.opencl" ; } diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc index 3f233809..5edd89e0 100644 --- a/src/tuning/padtranspose.cc +++ b/src/tuning/padtranspose.cc @@ -31,7 +31,7 @@ class TunePadTranspose { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/padtranspose.opencl" + #include "../src/kernels/level3/padtranspose.opencl" ; } diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc index 3998ba66..113e0a81 100644 --- a/src/tuning/transpose.cc +++ b/src/tuning/transpose.cc @@ -31,7 +31,7 @@ class TuneTranspose { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/transpose.opencl" + #include "../src/kernels/level3/transpose.opencl" ; } diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc index e820cfb0..c06e3e72 100644 --- a/src/tuning/xgemm.cc +++ b/src/tuning/xgemm.cc @@ -31,7 +31,7 @@ class TuneXgemm { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xgemm.opencl" + #include "../src/kernels/level3/xgemm.opencl" ; } -- cgit v1.2.3