diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-16 18:07:46 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-16 18:07:46 +0200 |
commit | 52ccaf5b25e14c9ce032315e5e96b1f27886d481 (patch) | |
tree | 087288b7aebf2a06ffc4e7dcbcd4353f7a3be6a7 /src/kernels/level3/copy_fast.opencl | |
parent | 39b7dbc5e37829abfbcfb77852b9138b31540b42 (diff) |
Added XOMATCOPY routines to perform out-of-place matrix scaling, copying, and/or transposing
Diffstat (limited to 'src/kernels/level3/copy_fast.opencl')
-rw-r--r-- | src/kernels/level3/copy_fast.opencl | 44 |
1 file changed, 42 insertions, 2 deletions
```diff
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index bfbfacd4..09e54e6d 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -38,13 +38,53 @@ R"(
 __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
 __kernel void CopyMatrixFast(const int ld,
                              __global const realC* restrict src,
-                             __global realC* dest) {
+                             __global realC* dest,
+                             const __constant real* restrict arg_alpha) {
+  const real alpha = arg_alpha[0];
   #pragma unroll
   for (int w_one=0; w_one<COPY_WPT; ++w_one) {
     const int id_one = get_global_id(0);
     const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
     const int id = id_two*(ld/COPY_VW) + id_one;
-    dest[id] = src[id];
+    realC result;
+    #if COPY_VW == 1
+      Multiply(result, alpha, src[id]);
+    #elif COPY_VW == 2
+      Multiply(result.x, alpha, src[id].x);
+      Multiply(result.y, alpha, src[id].y);
+    #elif COPY_VW == 4
+      Multiply(result.x, alpha, src[id].x);
+      Multiply(result.y, alpha, src[id].y);
+      Multiply(result.z, alpha, src[id].z);
+      Multiply(result.w, alpha, src[id].w);
+    #elif COPY_VW == 8
+      Multiply(result.s0, alpha, src[id].s0);
+      Multiply(result.s1, alpha, src[id].s1);
+      Multiply(result.s2, alpha, src[id].s2);
+      Multiply(result.s3, alpha, src[id].s3);
+      Multiply(result.s4, alpha, src[id].s4);
+      Multiply(result.s5, alpha, src[id].s5);
+      Multiply(result.s6, alpha, src[id].s6);
+      Multiply(result.s7, alpha, src[id].s7);
+    #elif COPY_VW == 16
+      Multiply(result.s0, alpha, src[id].s0);
+      Multiply(result.s1, alpha, src[id].s1);
+      Multiply(result.s2, alpha, src[id].s2);
+      Multiply(result.s3, alpha, src[id].s3);
+      Multiply(result.s4, alpha, src[id].s4);
+      Multiply(result.s5, alpha, src[id].s5);
+      Multiply(result.s6, alpha, src[id].s6);
+      Multiply(result.s7, alpha, src[id].s7);
+      Multiply(result.s8, alpha, src[id].s8);
+      Multiply(result.s9, alpha, src[id].s9);
+      Multiply(result.sA, alpha, src[id].sA);
+      Multiply(result.sB, alpha, src[id].sB);
+      Multiply(result.sC, alpha, src[id].sC);
+      Multiply(result.sD, alpha, src[id].sD);
+      Multiply(result.sE, alpha, src[id].sE);
+      Multiply(result.sF, alpha, src[id].sF);
+    #endif
+    dest[id] = result;;
   }
 }
 
```