summary refs log tree commit diff
path: root/src/kernels/level3/copy_fast.opencl
diff options
context:
space:
mode:
author Cedric Nugteren <web@cedricnugteren.nl> 2016-06-16 18:07:46 +0200
committer Cedric Nugteren <web@cedricnugteren.nl> 2016-06-16 18:07:46 +0200
commit 52ccaf5b25e14c9ce032315e5e96b1f27886d481 (patch)
tree 087288b7aebf2a06ffc4e7dcbcd4353f7a3be6a7 /src/kernels/level3/copy_fast.opencl
parent 39b7dbc5e37829abfbcfb77852b9138b31540b42 (diff)
Added XOMATCOPY routines to perform out-of-place matrix scaling, copying, and/or transposing
Diffstat (limited to 'src/kernels/level3/copy_fast.opencl')
-rw-r--r-- src/kernels/level3/copy_fast.opencl 44
1 files changed, 42 insertions, 2 deletions
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index bfbfacd4..09e54e6d 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -38,13 +38,53 @@ R"(
__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
__kernel void CopyMatrixFast(const int ld,
__global const realC* restrict src,
- __global realC* dest) {
+ __global realC* dest,
+ const __constant real* restrict arg_alpha) { // scaled copy: dest[id] receives src[id] combined with alpha via Multiply
+ const real alpha = arg_alpha[0]; // only element 0 of arg_alpha is read
#pragma unroll
for (int w_one=0; w_one<COPY_WPT; ++w_one) {
const int id_one = get_global_id(0);
const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
const int id = id_two*(ld/COPY_VW) + id_one;
- dest[id] = src[id];
+ realC result; // NOTE(review): stays unassigned if COPY_VW is not 1/2/4/8/16 (the #if chain below has no #else)
+ #if COPY_VW == 1
+ Multiply(result, alpha, src[id]); // no component access here: realC is handled as a single value
+ #elif COPY_VW == 2
+ Multiply(result.x, alpha, src[id].x); // vector case: one Multiply call per component
+ Multiply(result.y, alpha, src[id].y);
+ #elif COPY_VW == 4
+ Multiply(result.x, alpha, src[id].x);
+ Multiply(result.y, alpha, src[id].y);
+ Multiply(result.z, alpha, src[id].z);
+ Multiply(result.w, alpha, src[id].w);
+ #elif COPY_VW == 8
+ Multiply(result.s0, alpha, src[id].s0); // wider vectors use the numeric .sN component names
+ Multiply(result.s1, alpha, src[id].s1);
+ Multiply(result.s2, alpha, src[id].s2);
+ Multiply(result.s3, alpha, src[id].s3);
+ Multiply(result.s4, alpha, src[id].s4);
+ Multiply(result.s5, alpha, src[id].s5);
+ Multiply(result.s6, alpha, src[id].s6);
+ Multiply(result.s7, alpha, src[id].s7);
+ #elif COPY_VW == 16
+ Multiply(result.s0, alpha, src[id].s0);
+ Multiply(result.s1, alpha, src[id].s1);
+ Multiply(result.s2, alpha, src[id].s2);
+ Multiply(result.s3, alpha, src[id].s3);
+ Multiply(result.s4, alpha, src[id].s4);
+ Multiply(result.s5, alpha, src[id].s5);
+ Multiply(result.s6, alpha, src[id].s6);
+ Multiply(result.s7, alpha, src[id].s7);
+ Multiply(result.s8, alpha, src[id].s8);
+ Multiply(result.s9, alpha, src[id].s9);
+ Multiply(result.sA, alpha, src[id].sA); // components 10..15 are addressed as .sA-.sF
+ Multiply(result.sB, alpha, src[id].sB);
+ Multiply(result.sC, alpha, src[id].sC);
+ Multiply(result.sD, alpha, src[id].sD);
+ Multiply(result.sE, alpha, src[id].sE);
+ Multiply(result.sF, alpha, src[id].sF);
+ #endif // COPY_VW; Multiply is defined outside this hunk - presumably Multiply(c, a, b) stores a*b into c, confirm
+ dest[id] = result; // stored at the same index that was read from src
}
}