From 375193fe4e72b320eb63fbc6f98c24714f6970c2 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Tue, 3 Oct 2017 21:55:21 +0200 Subject: Gemm in-direct implementation now uses only 1 larger instead of max 3 optional temporary buffers --- src/kernels/level3/xgemm_part3.opencl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'src/kernels/level3/xgemm_part3.opencl') diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl index 3f0d590d..f447677f 100644 --- a/src/kernels/level3/xgemm_part3.opencl +++ b/src/kernels/level3/xgemm_part3.opencl @@ -17,7 +17,7 @@ R"( // ================================================================================================= -// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above. +// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions. INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK, const __global realM* restrict agm, const __global realN* restrict bgm, __global realM* cgm, realM cpm[NWI][MWI/VWM] @@ -192,10 +192,15 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK, const real_arg arg_beta, const __global realM* restrict agm, const __global realN* restrict bgm, - __global realM* cgm) { + __global realM* cgm, + const int b_offset, const int c_offset) { const real alpha = GetRealArg(arg_alpha); const real beta = GetRealArg(arg_beta); + // Adds the offsets (in case of use of a single temporary buffer for A, B, and C) + bgm = &bgm[b_offset]; + cgm = &cgm[c_offset]; + // Allocates workgroup-private memory (local memory) #if SA == 1 __local realM alm[KWG * MWG/VWM]; -- cgit v1.2.3 From b06bc01da90983ce484fded4e1a87f5fcd5c4eca Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sat, 14 Oct 2017 17:13:54 +0200 Subject: Make local memory pointers a define in OpenCL; some fixes to the recently changed transpose kernel code --- src/kernels/common.opencl | 10 ++++-- src/kernels/level3/invert_diagonal_blocks.opencl | 6 ++-- src/kernels/level3/transpose_fast.opencl | 40 ++++++++++++------------ src/kernels/level3/transpose_pad.opencl | 4 +-- src/kernels/level3/xgemm_direct_part1.opencl | 4 +-- src/kernels/level3/xgemm_direct_part2.opencl | 12 +++---- src/kernels/level3/xgemm_direct_part3.opencl | 2 +- src/kernels/level3/xgemm_part1.opencl | 8 ++--- src/kernels/level3/xgemm_part3.opencl | 6 ++-- src/kernels/opencl_to_cuda.h | 1 + 10 files changed, 49 insertions(+), 44 deletions(-) (limited to 'src/kernels/level3/xgemm_part3.opencl') diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl index a34877d9..01c411bc 100644 --- a/src/kernels/common.opencl +++ b/src/kernels/common.opencl @@ -23,8 +23,8 @@ R"( #endif // ================================================================================================= -#ifndef CUDA +#ifndef CUDA // Enable support for double-precision #if PRECISION == 16 #pragma OPENCL EXTENSION cl_khr_fp16: enable @@ -34,7 +34,6 @@ R"( #if PRECISION == 64 || PRECISION == 6464 #pragma OPENCL EXTENSION cl_khr_fp64: enable #endif - #endif // Half-precision @@ -120,10 +119,15 @@ R"( #define GetRealArg(x) x #endif +// Pointers to local memory objects (using a define because CUDA doesn't need them) +#ifndef LOCAL_PTR + #define LOCAL_PTR __local +#endif + // ================================================================================================= // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific -// devices, this is enabled (see src/routine.cc). +// devices, this is enabled (see src/routine.cpp). #ifndef USE_CL_MAD #define USE_CL_MAD 0 #endif diff --git a/src/kernels/level3/invert_diagonal_blocks.opencl b/src/kernels/level3/invert_diagonal_blocks.opencl index 93241700..281fdcff 100644 --- a/src/kernels/level3/invert_diagonal_blocks.opencl +++ b/src/kernels/level3/invert_diagonal_blocks.opencl @@ -164,7 +164,7 @@ void InvertDiagonalBlock(int n, __global const real* restrict src, const int src // ================================================================================================= // Triple matrix-multiplication kernel: C = A * B -INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, __local real* blm, int n, +INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, LOCAL_PTR real* blm, int n, __global const real* agm, __global const real* bgm, __global real* cgm, const int lda, const int ldb, const int ldc, int current_size, int num_pages, const int block_size) { @@ -250,7 +250,7 @@ INLINE_FUNC void TripleMatMul(const int size, const bool upper, const int part, // ================================================================================================= // Triple matrix-multiplication kernel part 1: B12 = A12 * B22 (upper) or B21 = A21 * B11 (lower) -INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local real* blm, int n, +INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, LOCAL_PTR real* blm, int n, __global const real* src, const int a_offset, const int lda, __global real* dest, int current_size, int num_pages, const int block_size) { @@ -286,7 +286,7 @@ INLINE_FUNC void TripleMatMulPart1(const int size, const bool upper, __local rea } // Triple matrix-multiplication kernel part 1: B12 = -B11 * B12 (upper) or B21 = -B22 * B21 (lower) -INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, __local real* blm, const int n, +INLINE_FUNC void TripleMatMulPart2(const int size, const bool upper, LOCAL_PTR real* blm, const int n, __global real* dest, int current_size, int num_pages, const int block_size) { // Emulates a 3D grid: NX * (NY * num_pages) diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl index 5f9ba209..37b25d99 100644 --- a/src/kernels/level3/transpose_fast.opencl +++ b/src/kernels/level3/transpose_fast.opencl @@ -87,10 +87,10 @@ void TransposeMatrixFast(const int ld, results[0].x = v[0].x; results[0].y = v[1].x; results[1].x = v[0].y; results[1].y = v[1].y; #elif TRA_WPT == 4 - results[0].x = v[0].x; results[0].y; = v[1].x; results[0].z = v[2].x; results[0].w = v[3].x; - results[1].x = v[0].y; results[1].y; = v[1].y; results[1].z = v[2].y; results[1].w = v[3].y; - results[2].x = v[0].z; results[2].y; = v[1].z; results[2].z = v[2].z; results[2].w = v[3].z; - results[3].x = v[0].w; results[3].y; = v[1].w; results[3].z = v[2].w; results[3].w = v[3].w; + results[0].x = v[0].x; results[0].y = v[1].x; results[0].z = v[2].x; results[0].w = v[3].x; + results[1].x = v[0].y; results[1].y = v[1].y; results[1].z = v[2].y; results[1].w = v[3].y; + results[2].x = v[0].z; results[2].y = v[1].z; results[2].z = v[2].z; results[2].w = v[3].z; + results[3].x = v[0].w; results[3].y = v[1].w; results[3].z = v[2].w; results[3].w = v[3].w; #elif TRA_WPT == 8 results[0].s0 = v[0].s0; results[0].s1 = v[1].s0; results[0].s2 = v[2].s0; results[0].s3 = v[3].s0; results[0].s4 = v[4].s0; results[0].s5 = v[5].s0; results[0].s6 = v[6].s0; results[0].s7 = v[7].s0; results[1].s0 = v[0].s1; results[1].s1 = v[1].s1; results[1].s2 = v[2].s1; results[1].s3 = v[3].s1; results[1].s4 = v[4].s1; results[1].s5 = v[5].s1; results[1].s6 = v[6].s1; results[1].s7 = v[7].s1; @@ -101,22 +101,22 @@ void TransposeMatrixFast(const int ld, results[6].s0 = v[0].s6; results[6].s1 = v[1].s6; results[6].s2 = v[2].s6; results[6].s3 = v[3].s6; results[6].s4 = v[4].s6; results[6].s5 = v[5].s6; results[6].s6 = v[6].s6; results[6].s7 = v[7].s6; results[7].s0 = v[0].s7; results[7].s1 = v[1].s7; results[7].s2 = v[2].s7; results[7].s3 = v[3].s7; results[7].s4 = v[4].s7; results[7].s5 = v[5].s7; results[7].s6 = v[6].s7; results[7].s7 = v[7].s7; #elif TRA_WPT == 16 - results[ 0].s0 = v[0].s0; results[ 0].s1 = v[1].s0; results[ 0].s2 = v[2].s0; results[ 0].s3 = v[3].s0; results[ 0].s4 = v[4].s0; results[ 0].s5 = v[5].s0; results[ 0].s6 = v[6].s0; results[ 0].s7 = v[7].s0; results[ 0].s8 = v[8].s0; results[ 0].s9 = v[9].s0; results[ 0].sA = v[10].s0;, results[ 0].sB = v[11].s0;, results[ 0].sC = v[12].s0;, results[ 0].sD = v[13].s0;, results[ 0].sE = v[14].s0;, results[ 0].sF = v[15].s0; - results[ 1].s0 = v[0].s1; results[ 1].s1 = v[1].s1; results[ 1].s2 = v[2].s1; results[ 1].s3 = v[3].s1; results[ 1].s4 = v[4].s1; results[ 1].s5 = v[5].s1; results[ 1].s6 = v[6].s1; results[ 1].s7 = v[7].s1; results[ 1].s8 = v[8].s1; results[ 1].s9 = v[9].s1; results[ 1].sA = v[10].s1;, results[ 1].sB = v[11].s1;, results[ 1].sC = v[12].s1;, results[ 1].sD = v[13].s1;, results[ 1].sE = v[14].s1;, results[ 1].sF = v[15].s1; - results[ 2].s0 = v[0].s2; results[ 2].s1 = v[1].s2; results[ 2].s2 = v[2].s2; results[ 2].s3 = v[3].s2; results[ 2].s4 = v[4].s2; results[ 2].s5 = v[5].s2; results[ 2].s6 = v[6].s2; results[ 2].s7 = v[7].s2; results[ 2].s8 = v[8].s2; results[ 2].s9 = v[9].s2; results[ 2].sA = v[10].s2;, results[ 2].sB = v[11].s2;, results[ 2].sC = v[12].s2;, results[ 2].sD = v[13].s2;, results[ 2].sE = v[14].s2;, results[ 2].sF = v[15].s2; - results[ 3].s0 = v[0].s3; results[ 3].s1 = v[1].s3; results[ 3].s2 = v[2].s3; results[ 3].s3 = v[3].s3; results[ 3].s4 = v[4].s3; results[ 3].s5 = v[5].s3; results[ 3].s6 = v[6].s3; results[ 3].s7 = v[7].s3; results[ 3].s8 = v[8].s3; results[ 3].s9 = v[9].s3; results[ 3].sA = v[10].s3;, results[ 3].sB = v[11].s3;, results[ 3].sC = v[12].s3;, results[ 3].sD = v[13].s3;, results[ 3].sE = v[14].s3;, results[ 3].sF = v[15].s3; - results[ 4].s0 = v[0].s4; results[ 4].s1 = v[1].s4; results[ 4].s2 = v[2].s4; results[ 4].s3 = v[3].s4; results[ 4].s4 = v[4].s4; results[ 4].s5 = v[5].s4; results[ 4].s6 = v[6].s4; results[ 4].s7 = v[7].s4; results[ 4].s8 = v[8].s4; results[ 4].s9 = v[9].s4; results[ 4].sA = v[10].s4;, results[ 4].sB = v[11].s4;, results[ 4].sC = v[12].s4;, results[ 4].sD = v[13].s4;, results[ 4].sE = v[14].s4;, results[ 4].sF = v[15].s4; - results[ 5].s0 = v[0].s5; results[ 5].s1 = v[1].s5; results[ 5].s2 = v[2].s5; results[ 5].s3 = v[3].s5; results[ 5].s4 = v[4].s5; results[ 5].s5 = v[5].s5; results[ 5].s6 = v[6].s5; results[ 5].s7 = v[7].s5; results[ 5].s8 = v[8].s5; results[ 5].s9 = v[9].s5; results[ 5].sA = v[10].s5;, results[ 5].sB = v[11].s5;, results[ 5].sC = v[12].s5;, results[ 5].sD = v[13].s5;, results[ 5].sE = v[14].s5;, results[ 5].sF = v[15].s5; - results[ 6].s0 = v[0].s6; results[ 6].s1 = v[1].s6; results[ 6].s2 = v[2].s6; results[ 6].s3 = v[3].s6; results[ 6].s4 = v[4].s6; results[ 6].s5 = v[5].s6; results[ 6].s6 = v[6].s6; results[ 6].s7 = v[7].s6; results[ 6].s8 = v[8].s6; results[ 6].s9 = v[9].s6; results[ 6].sA = v[10].s6;, results[ 6].sB = v[11].s6;, results[ 6].sC = v[12].s6;, results[ 6].sD = v[13].s6;, results[ 6].sE = v[14].s6;, results[ 6].sF = v[15].s6; - results[ 7].s0 = v[0].s7; results[ 7].s1 = v[1].s7; results[ 7].s2 = v[2].s7; results[ 7].s3 = v[3].s7; results[ 7].s4 = v[4].s7; results[ 7].s5 = v[5].s7; results[ 7].s6 = v[6].s7; results[ 7].s7 = v[7].s7; results[ 7].s8 = v[8].s7; results[ 7].s9 = v[9].s7; results[ 7].sA = v[10].s7;, results[ 7].sB = v[11].s7;, results[ 7].sC = v[12].s7;, results[ 7].sD = v[13].s7;, results[ 7].sE = v[14].s7;, results[ 7].sF = v[15].s7; - results[ 8].s0 = v[0].s8; results[ 8].s1 = v[1].s8; results[ 8].s2 = v[2].s8; results[ 8].s3 = v[3].s8; results[ 8].s4 = v[4].s8; results[ 8].s5 = v[5].s8; results[ 8].s6 = v[6].s8; results[ 8].s7 = v[7].s8; results[ 8].s8 = v[8].s8; results[ 8].s9 = v[9].s8; results[ 8].sA = v[10].s8;, results[ 8].sB = v[11].s8;, results[ 8].sC = v[12].s8;, results[ 8].sD = v[13].s8;, results[ 8].sE = v[14].s8;, results[ 8].sF = v[15].s8; - results[ 9].s0 = v[0].s9; results[ 9].s1 = v[1].s9; results[ 9].s2 = v[2].s9; results[ 9].s3 = v[3].s9; results[ 9].s4 = v[4].s9; results[ 9].s5 = v[5].s9; results[ 9].s6 = v[6].s9; results[ 9].s7 = v[7].s9; results[ 9].s8 = v[8].s9; results[ 9].s9 = v[9].s9; results[ 9].sA = v[10].s9;, results[ 9].sB = v[11].s9;, results[ 9].sC = v[12].s9;, results[ 9].sD = v[13].s9;, results[ 9].sE = v[14].s9;, results[ 9].sF = v[15].s9; - results[10].s0 = v[0].sA; results[10].s1 = v[1].sA; results[10].s2 = v[2].sA; results[10].s3 = v[3].sA; results[10].s4 = v[4].sA; results[10].s5 = v[5].sA; results[10].s6 = v[6].sA; results[10].s7 = v[7].sA; results[10].s8 = v[8].sA; results[10].s9 = v[9].sA; results[10].sA = v[10].sA;, results[10].sB = v[11].sA;, results[10].sC = v[12].sA;, results[10].sD = v[13].sA;, results[10].sE = v[14].sA;, results[10].sF = v[15].sA; - results[11].s0 = v[0].sB; results[11].s1 = v[1].sB; results[11].s2 = v[2].sB; results[11].s3 = v[3].sB; results[11].s4 = v[4].sB; results[11].s5 = v[5].sB; results[11].s6 = v[6].sB; results[11].s7 = v[7].sB; results[11].s8 = v[8].sB; results[11].s9 = v[9].sB; results[11].sA = v[10].sB;, results[11].sB = v[11].sB;, results[11].sC = v[12].sB;, results[11].sD = v[13].sB;, results[11].sE = v[14].sB;, results[11].sF = v[15].sB; - results[12].s0 = v[0].sC; results[12].s1 = v[1].sC; results[12].s2 = v[2].sC; results[12].s3 = v[3].sC; results[12].s4 = v[4].sC; results[12].s5 = v[5].sC; results[12].s6 = v[6].sC; results[12].s7 = v[7].sC; results[12].s8 = v[8].sC; results[12].s9 = v[9].sC; results[12].sA = v[10].sC;, results[12].sB = v[11].sC;, results[12].sC = v[12].sC;, results[12].sD = v[13].sC;, results[12].sE = v[14].sC;, results[12].sF = v[15].sC; - results[13].s0 = v[0].sD; results[13].s1 = v[1].sD; results[13].s2 = v[2].sD; results[13].s3 = v[3].sD; results[13].s4 = v[4].sD; results[13].s5 = v[5].sD; results[13].s6 = v[6].sD; results[13].s7 = v[7].sD; results[13].s8 = v[8].sD; results[13].s9 = v[9].sD; results[13].sA = v[10].sD;, results[13].sB = v[11].sD;, results[13].sC = v[12].sD;, results[13].sD = v[13].sD;, results[13].sE = v[14].sD;, results[13].sF = v[15].sD; - results[14].s0 = v[0].sE; results[14].s1 = v[1].sE; results[14].s2 = v[2].sE; results[14].s3 = v[3].sE; results[14].s4 = v[4].sE; results[14].s5 = v[5].sE; results[14].s6 = v[6].sE; results[14].s7 = v[7].sE; results[14].s8 = v[8].sE; results[14].s9 = v[9].sE; results[14].sA = v[10].sE;, results[14].sB = v[11].sE;, results[14].sC = v[12].sE;, results[14].sD = v[13].sE;, results[14].sE = v[14].sE;, results[14].sF = v[15].sE; - results[15].s0 = v[0].sF; results[15].s1 = v[1].sF; results[15].s2 = v[2].sF; results[15].s3 = v[3].sF; results[15].s4 = v[4].sF; results[15].s5 = v[5].sF; results[15].s6 = v[6].sF; results[15].s7 = v[7].sF; results[15].s8 = v[8].sF; results[15].s9 = v[9].sF; results[15].sA = v[10].sF;, results[15].sB = v[11].sF;, results[15].sC = v[12].sF;, results[15].sD = v[13].sF;, results[15].sE = v[14].sF;, results[15].sF = v[15].sF; + results[ 0].s0 = v[0].s0; results[ 0].s1 = v[1].s0; results[ 0].s2 = v[2].s0; results[ 0].s3 = v[3].s0; results[ 0].s4 = v[4].s0; results[ 0].s5 = v[5].s0; results[ 0].s6 = v[6].s0; results[ 0].s7 = v[7].s0; results[ 0].s8 = v[8].s0; results[ 0].s9 = v[9].s0; results[ 0].sA = v[10].s0; results[ 0].sB = v[11].s0; results[ 0].sC = v[12].s0; results[ 0].sD = v[13].s0; results[ 0].sE = v[14].s0; results[ 0].sF = v[15].s0; + results[ 1].s0 = v[0].s1; results[ 1].s1 = v[1].s1; results[ 1].s2 = v[2].s1; results[ 1].s3 = v[3].s1; results[ 1].s4 = v[4].s1; results[ 1].s5 = v[5].s1; results[ 1].s6 = v[6].s1; results[ 1].s7 = v[7].s1; results[ 1].s8 = v[8].s1; results[ 1].s9 = v[9].s1; results[ 1].sA = v[10].s1; results[ 1].sB = v[11].s1; results[ 1].sC = v[12].s1; results[ 1].sD = v[13].s1; results[ 1].sE = v[14].s1; results[ 1].sF = v[15].s1; + results[ 2].s0 = v[0].s2; results[ 2].s1 = v[1].s2; results[ 2].s2 = v[2].s2; results[ 2].s3 = v[3].s2; results[ 2].s4 = v[4].s2; results[ 2].s5 = v[5].s2; results[ 2].s6 = v[6].s2; results[ 2].s7 = v[7].s2; results[ 2].s8 = v[8].s2; results[ 2].s9 = v[9].s2; results[ 2].sA = v[10].s2; results[ 2].sB = v[11].s2; results[ 2].sC = v[12].s2; results[ 2].sD = v[13].s2; results[ 2].sE = v[14].s2; results[ 2].sF = v[15].s2; + results[ 3].s0 = v[0].s3; results[ 3].s1 = v[1].s3; results[ 3].s2 = v[2].s3; results[ 3].s3 = v[3].s3; results[ 3].s4 = v[4].s3; results[ 3].s5 = v[5].s3; results[ 3].s6 = v[6].s3; results[ 3].s7 = v[7].s3; results[ 3].s8 = v[8].s3; results[ 3].s9 = v[9].s3; results[ 3].sA = v[10].s3; results[ 3].sB = v[11].s3; results[ 3].sC = v[12].s3; results[ 3].sD = v[13].s3; results[ 3].sE = v[14].s3; results[ 3].sF = v[15].s3; + results[ 4].s0 = v[0].s4; results[ 4].s1 = v[1].s4; results[ 4].s2 = v[2].s4; results[ 4].s3 = v[3].s4; results[ 4].s4 = v[4].s4; results[ 4].s5 = v[5].s4; results[ 4].s6 = v[6].s4; results[ 4].s7 = v[7].s4; results[ 4].s8 = v[8].s4; results[ 4].s9 = v[9].s4; results[ 4].sA = v[10].s4; results[ 4].sB = v[11].s4; results[ 4].sC = v[12].s4; results[ 4].sD = v[13].s4; results[ 4].sE = v[14].s4; results[ 4].sF = v[15].s4; + results[ 5].s0 = v[0].s5; results[ 5].s1 = v[1].s5; results[ 5].s2 = v[2].s5; results[ 5].s3 = v[3].s5; results[ 5].s4 = v[4].s5; results[ 5].s5 = v[5].s5; results[ 5].s6 = v[6].s5; results[ 5].s7 = v[7].s5; results[ 5].s8 = v[8].s5; results[ 5].s9 = v[9].s5; results[ 5].sA = v[10].s5; results[ 5].sB = v[11].s5; results[ 5].sC = v[12].s5; results[ 5].sD = v[13].s5; results[ 5].sE = v[14].s5; results[ 5].sF = v[15].s5; + results[ 6].s0 = v[0].s6; results[ 6].s1 = v[1].s6; results[ 6].s2 = v[2].s6; results[ 6].s3 = v[3].s6; results[ 6].s4 = v[4].s6; results[ 6].s5 = v[5].s6; results[ 6].s6 = v[6].s6; results[ 6].s7 = v[7].s6; results[ 6].s8 = v[8].s6; results[ 6].s9 = v[9].s6; results[ 6].sA = v[10].s6; results[ 6].sB = v[11].s6; results[ 6].sC = v[12].s6; results[ 6].sD = v[13].s6; results[ 6].sE = v[14].s6; results[ 6].sF = v[15].s6; + results[ 7].s0 = v[0].s7; results[ 7].s1 = v[1].s7; results[ 7].s2 = v[2].s7; results[ 7].s3 = v[3].s7; results[ 7].s4 = v[4].s7; results[ 7].s5 = v[5].s7; results[ 7].s6 = v[6].s7; results[ 7].s7 = v[7].s7; results[ 7].s8 = v[8].s7; results[ 7].s9 = v[9].s7; results[ 7].sA = v[10].s7; results[ 7].sB = v[11].s7; results[ 7].sC = v[12].s7; results[ 7].sD = v[13].s7; results[ 7].sE = v[14].s7; results[ 7].sF = v[15].s7; + results[ 8].s0 = v[0].s8; results[ 8].s1 = v[1].s8; results[ 8].s2 = v[2].s8; results[ 8].s3 = v[3].s8; results[ 8].s4 = v[4].s8; results[ 8].s5 = v[5].s8; results[ 8].s6 = v[6].s8; results[ 8].s7 = v[7].s8; results[ 8].s8 = v[8].s8; results[ 8].s9 = v[9].s8; results[ 8].sA = v[10].s8; results[ 8].sB = v[11].s8; results[ 8].sC = v[12].s8; results[ 8].sD = v[13].s8; results[ 8].sE = v[14].s8; results[ 8].sF = v[15].s8; + results[ 9].s0 = v[0].s9; results[ 9].s1 = v[1].s9; results[ 9].s2 = v[2].s9; results[ 9].s3 = v[3].s9; results[ 9].s4 = v[4].s9; results[ 9].s5 = v[5].s9; results[ 9].s6 = v[6].s9; results[ 9].s7 = v[7].s9; results[ 9].s8 = v[8].s9; results[ 9].s9 = v[9].s9; results[ 9].sA = v[10].s9; results[ 9].sB = v[11].s9; results[ 9].sC = v[12].s9; results[ 9].sD = v[13].s9; results[ 9].sE = v[14].s9; results[ 9].sF = v[15].s9; + results[10].s0 = v[0].sA; results[10].s1 = v[1].sA; results[10].s2 = v[2].sA; results[10].s3 = v[3].sA; results[10].s4 = v[4].sA; results[10].s5 = v[5].sA; results[10].s6 = v[6].sA; results[10].s7 = v[7].sA; results[10].s8 = v[8].sA; results[10].s9 = v[9].sA; results[10].sA = v[10].sA; results[10].sB = v[11].sA; results[10].sC = v[12].sA; results[10].sD = v[13].sA; results[10].sE = v[14].sA; results[10].sF = v[15].sA; + results[11].s0 = v[0].sB; results[11].s1 = v[1].sB; results[11].s2 = v[2].sB; results[11].s3 = v[3].sB; results[11].s4 = v[4].sB; results[11].s5 = v[5].sB; results[11].s6 = v[6].sB; results[11].s7 = v[7].sB; results[11].s8 = v[8].sB; results[11].s9 = v[9].sB; results[11].sA = v[10].sB; results[11].sB = v[11].sB; results[11].sC = v[12].sB; results[11].sD = v[13].sB; results[11].sE = v[14].sB; results[11].sF = v[15].sB; + results[12].s0 = v[0].sC; results[12].s1 = v[1].sC; results[12].s2 = v[2].sC; results[12].s3 = v[3].sC; results[12].s4 = v[4].sC; results[12].s5 = v[5].sC; results[12].s6 = v[6].sC; results[12].s7 = v[7].sC; results[12].s8 = v[8].sC; results[12].s9 = v[9].sC; results[12].sA = v[10].sC; results[12].sB = v[11].sC; results[12].sC = v[12].sC; results[12].sD = v[13].sC; results[12].sE = v[14].sC; results[12].sF = v[15].sC; + results[13].s0 = v[0].sD; results[13].s1 = v[1].sD; results[13].s2 = v[2].sD; results[13].s3 = v[3].sD; results[13].s4 = v[4].sD; results[13].s5 = v[5].sD; results[13].s6 = v[6].sD; results[13].s7 = v[7].sD; results[13].s8 = v[8].sD; results[13].s9 = v[9].sD; results[13].sA = v[10].sD; results[13].sB = v[11].sD; results[13].sC = v[12].sD; results[13].sD = v[13].sD; results[13].sE = v[14].sD; results[13].sF = v[15].sD; + results[14].s0 = v[0].sE; results[14].s1 = v[1].sE; results[14].s2 = v[2].sE; results[14].s3 = v[3].sE; results[14].s4 = v[4].sE; results[14].s5 = v[5].sE; results[14].s6 = v[6].sE; results[14].s7 = v[7].sE; results[14].s8 = v[8].sE; results[14].s9 = v[9].sE; results[14].sA = v[10].sE; results[14].sB = v[11].sE; results[14].sC = v[12].sE; results[14].sD = v[13].sE; results[14].sE = v[14].sE; results[14].sF = v[15].sE; + results[15].s0 = v[0].sF; results[15].s1 = v[1].sF; results[15].s2 = v[2].sF; results[15].s3 = v[3].sF; results[15].s4 = v[4].sF; results[15].s5 = v[5].sF; results[15].s6 = v[6].sF; results[15].s7 = v[7].sF; results[15].s8 = v[8].sF; results[15].s9 = v[9].sF; results[15].sA = v[10].sF; results[15].sB = v[11].sF; results[15].sC = v[12].sF; results[15].sD = v[13].sF; results[15].sE = v[14].sF; results[15].sF = v[15].sF; #endif // Multiplies by alpha and then stores the results into the destination matrix diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl index 49c5b9a3..ba9a6a56 100644 --- a/src/kernels/level3/transpose_pad.opencl +++ b/src/kernels/level3/transpose_pad.opencl @@ -24,7 +24,7 @@ R"( // Transposes a matrix from source to destination. The output is padded with zero values in case the // destination matrix dimensions are larger than the transposed source matrix dimensions. -INLINE_FUNC void _TransposePadMatrix(__local real* tile, +INLINE_FUNC void _TransposePadMatrix(LOCAL_PTR real* tile, const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, @@ -105,7 +105,7 @@ void TransposePadMatrix(const int src_one, const int src_two, // Transposes a matrix, while considering possible padding in the source matrix. Data is read from a // padded source matrix, but only the actual data is written back to the transposed destination // matrix. This kernel optionally checks for upper/lower triangular matrices. -INLINE_FUNC void _TransposeMatrix(__local real* tile, +INLINE_FUNC void _TransposeMatrix(LOCAL_PTR real* tile, const int src_one, const int src_two, const int src_ld, const int src_offset, __global const real* restrict src, diff --git a/src/kernels/level3/xgemm_direct_part1.opencl b/src/kernels/level3/xgemm_direct_part1.opencl index 8b650589..7d185224 100644 --- a/src/kernels/level3/xgemm_direct_part1.opencl +++ b/src/kernels/level3/xgemm_direct_part1.opencl @@ -184,7 +184,7 @@ INLINE_FUNC void GlobalToPrivateCheckedB(const __global real* restrict bgms, rea // Caches on-chip local memory into per-thread private memory (registers). This function is specific // for caching the A input matrix. -INLINE_FUNC void LocalToPrivateDirectA(__local real* alm, real apm[MWID], const int kg, +INLINE_FUNC void LocalToPrivateDirectA(LOCAL_PTR real* alm, real apm[MWID], const int kg, const int a_transpose) { #pragma unroll for (int mi=0; mi