summary | refs | log | tree | commit | diff
path: root/src/kernels/level3/copy_fast.opencl
diff options
context:
space:
mode:
author: Cedric Nugteren <web@cedricnugteren.nl> 2016-06-28 22:32:25 +0200
committer: GitHub <noreply@github.com> 2016-06-28 22:32:25 +0200
commit: 7c13bacf129291e3e295ecb6e833788477085fa0 (patch)
tree: d114eeca418444d0b1c70cc9cce983de041235c9 /src/kernels/level3/copy_fast.opencl
parent: 181eb20bbf15cf11baaf6112b6965050c49dd543 (diff)
parent: 577f0ee1179014ece853af39d6f0ff0c87316eb3 (diff)
Merge pull request #70 from CNugteren/development
Update to version 0.8.0
Diffstat (limited to 'src/kernels/level3/copy_fast.opencl')
-rw-r--r-- src/kernels/level3/copy_fast.opencl 96
1 files changed, 96 insertions, 0 deletions
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
new file mode 100644
index 00000000..09e54e6d
--- /dev/null
+++ b/src/kernels/level3/copy_fast.opencl
@@ -0,0 +1,96 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains a common kernel shared among different BLAS routines: the fast kernel to
+// copy matrices, scaling every element by alpha.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Data-widths: 'realC' is the 'real' type at vector width COPY_VW (supported: 1, 2, 4, 8, 16)
+#if COPY_VW == 1
+ typedef real realC;
+#elif COPY_VW == 2
+ typedef real2 realC;
+#elif COPY_VW == 4
+ typedef real4 realC;
+#elif COPY_VW == 8
+ typedef real8 realC;
+#elif COPY_VW == 16
+ typedef real16 realC;
+#endif
+
+// =================================================================================================
+
+// Fast scaled copy: dest = alpha * src. Requires 'ld' and the number of threads in dimension 0 to
+// be a multiple of COPY_VW, and both matrices to be of the same dimensions and without offset.
+__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
+__kernel void CopyMatrixFast(const int ld,  // counted in scalar 'real' elements
+                             __global const realC* restrict src,  // input, only read
+                             __global realC* dest,  // output, written at the same index as 'src'
+                             const __constant real* restrict arg_alpha) {  // only [0] is read
+  const real alpha = arg_alpha[0];
+  const int id_one = get_global_id(0);  // independent of 'w_one', so evaluated once up front
+  #pragma unroll
+  for (int w_one=0; w_one<COPY_WPT; ++w_one) {  // COPY_WPT realC elements per work-item
+    const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
+    const int id = id_two*(ld/COPY_VW) + id_one;  // 'ld' is rescaled since src/dest hold realC
+    realC result;  // filled one component at a time through Multiply (defined elsewhere)
+    #if COPY_VW == 1
+      Multiply(result, alpha, src[id]);
+    #elif COPY_VW == 2
+      Multiply(result.x, alpha, src[id].x);
+      Multiply(result.y, alpha, src[id].y);
+    #elif COPY_VW == 4
+      Multiply(result.x, alpha, src[id].x);
+      Multiply(result.y, alpha, src[id].y);
+      Multiply(result.z, alpha, src[id].z);
+      Multiply(result.w, alpha, src[id].w);
+    #elif COPY_VW == 8
+      Multiply(result.s0, alpha, src[id].s0);
+      Multiply(result.s1, alpha, src[id].s1);
+      Multiply(result.s2, alpha, src[id].s2);
+      Multiply(result.s3, alpha, src[id].s3);
+      Multiply(result.s4, alpha, src[id].s4);
+      Multiply(result.s5, alpha, src[id].s5);
+      Multiply(result.s6, alpha, src[id].s6);
+      Multiply(result.s7, alpha, src[id].s7);
+    #elif COPY_VW == 16
+      Multiply(result.s0, alpha, src[id].s0);
+      Multiply(result.s1, alpha, src[id].s1);
+      Multiply(result.s2, alpha, src[id].s2);
+      Multiply(result.s3, alpha, src[id].s3);
+      Multiply(result.s4, alpha, src[id].s4);
+      Multiply(result.s5, alpha, src[id].s5);
+      Multiply(result.s6, alpha, src[id].s6);
+      Multiply(result.s7, alpha, src[id].s7);
+      Multiply(result.s8, alpha, src[id].s8);
+      Multiply(result.s9, alpha, src[id].s9);
+      Multiply(result.sA, alpha, src[id].sA);
+      Multiply(result.sB, alpha, src[id].sB);
+      Multiply(result.sC, alpha, src[id].sC);
+      Multiply(result.sD, alpha, src[id].sD);
+      Multiply(result.sE, alpha, src[id].sE);
+      Multiply(result.sF, alpha, src[id].sF);
+    #endif
+    dest[id] = result;
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)"
+
+// =================================================================================================