summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-06-16 18:07:46 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-06-16 18:07:46 +0200
commit52ccaf5b25e14c9ce032315e5e96b1f27886d481 (patch)
tree087288b7aebf2a06ffc4e7dcbcd4353f7a3be6a7 /src
parent39b7dbc5e37829abfbcfb77852b9138b31540b42 (diff)
Added XOMATCOPY routines to perform out-of-place matrix scaling, copying, and/or transposing
Diffstat (limited to 'src')
-rw-r--r--src/clblast.cc56
-rw-r--r--src/clblast_c.cc81
-rw-r--r--src/kernels/level3/copy_fast.opencl44
-rw-r--r--src/kernels/level3/copy_pad.opencl8
-rw-r--r--src/kernels/level3/transpose_fast.opencl46
-rw-r--r--src/kernels/level3/transpose_pad.opencl8
-rw-r--r--src/routine.cc15
-rw-r--r--src/routines/level3/xgemm.cc12
-rw-r--r--src/routines/level3/xher2k.cc18
-rw-r--r--src/routines/level3/xherk.cc12
-rw-r--r--src/routines/level3/xsyr2k.cc12
-rw-r--r--src/routines/level3/xsyrk.cc9
-rw-r--r--src/routines/levelx/xomatcopy.cc103
-rw-r--r--src/tuning/copy_fast.cc4
-rw-r--r--src/tuning/copy_pad.cc4
-rw-r--r--src/tuning/transpose_fast.cc4
-rw-r--r--src/tuning/transpose_pad.cc4
17 files changed, 402 insertions, 38 deletions
diff --git a/src/clblast.cc b/src/clblast.cc
index 07322327..e3df6ede 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -68,6 +68,9 @@
#include "internal/routines/level3/xher2k.h"
#include "internal/routines/level3/xtrmm.h"
+// Extra includes (level-x)
+#include "internal/routines/levelx/xomatcopy.h"
+
namespace clblast {
// =================================================================================================
@@ -2062,6 +2065,59 @@ template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triang
cl_command_queue*, cl_event*);
// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto queue_cpp = Queue(*queue);
+ auto routine = Xomatcopy<T>(queue_cpp, event);
+ auto status = routine.SetUp();
+ if (status != StatusCode::kSuccess) { return status; }
+ return routine.DoOmatcopy(layout, a_transpose,
+ m, n,
+ alpha,
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld);
+}
+template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const float,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const double,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const float2,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const double2,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
+ const size_t, const size_t,
+ const half,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
+
+// =================================================================================================
// Clears the cache of stored binaries
StatusCode ClearCache() { return cache::ClearCache(); }
diff --git a/src/clblast_c.cc b/src/clblast_c.cc
index 2aac907a..22cb2192 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cc
@@ -2832,6 +2832,87 @@ StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle tri
}
// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// OMATCOPY
+StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ float2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ double2{alpha.s[0], alpha.s[1]},
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Transpose>(a_transpose),
+ m, n,
+ alpha,
+ a_buffer, a_offset, a_ld,
+ b_buffer, b_offset, b_ld,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
+
+// =================================================================================================
// Clears the cache of stored binaries
StatusCode CLBlastClearCache() {
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index bfbfacd4..09e54e6d 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -38,13 +38,53 @@ R"(
__attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
__kernel void CopyMatrixFast(const int ld,
__global const realC* restrict src,
- __global realC* dest) {
+ __global realC* dest,
+ const __constant real* restrict arg_alpha) {
+ const real alpha = arg_alpha[0];
#pragma unroll
for (int w_one=0; w_one<COPY_WPT; ++w_one) {
const int id_one = get_global_id(0);
const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
const int id = id_two*(ld/COPY_VW) + id_one;
- dest[id] = src[id];
+ realC result;
+ #if COPY_VW == 1
+ Multiply(result, alpha, src[id]);
+ #elif COPY_VW == 2
+ Multiply(result.x, alpha, src[id].x);
+ Multiply(result.y, alpha, src[id].y);
+ #elif COPY_VW == 4
+ Multiply(result.x, alpha, src[id].x);
+ Multiply(result.y, alpha, src[id].y);
+ Multiply(result.z, alpha, src[id].z);
+ Multiply(result.w, alpha, src[id].w);
+ #elif COPY_VW == 8
+ Multiply(result.s0, alpha, src[id].s0);
+ Multiply(result.s1, alpha, src[id].s1);
+ Multiply(result.s2, alpha, src[id].s2);
+ Multiply(result.s3, alpha, src[id].s3);
+ Multiply(result.s4, alpha, src[id].s4);
+ Multiply(result.s5, alpha, src[id].s5);
+ Multiply(result.s6, alpha, src[id].s6);
+ Multiply(result.s7, alpha, src[id].s7);
+ #elif COPY_VW == 16
+ Multiply(result.s0, alpha, src[id].s0);
+ Multiply(result.s1, alpha, src[id].s1);
+ Multiply(result.s2, alpha, src[id].s2);
+ Multiply(result.s3, alpha, src[id].s3);
+ Multiply(result.s4, alpha, src[id].s4);
+ Multiply(result.s5, alpha, src[id].s5);
+ Multiply(result.s6, alpha, src[id].s6);
+ Multiply(result.s7, alpha, src[id].s7);
+ Multiply(result.s8, alpha, src[id].s8);
+ Multiply(result.s9, alpha, src[id].s9);
+ Multiply(result.sA, alpha, src[id].sA);
+ Multiply(result.sB, alpha, src[id].sB);
+ Multiply(result.sC, alpha, src[id].sC);
+ Multiply(result.sD, alpha, src[id].sD);
+ Multiply(result.sE, alpha, src[id].sE);
+ Multiply(result.sF, alpha, src[id].sF);
+ #endif
+ dest[id] = result;;
}
}
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index f211af0f..d276cc60 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -31,7 +31,9 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
+ const __constant real* restrict arg_alpha,
const int do_conjugate) {
+ const real alpha = arg_alpha[0];
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -52,7 +54,7 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
// Stores the value in the destination matrix
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
- dest[id_two*dest_ld + id_one + dest_offset] = value;
+ Multiply(dest[id_two*dest_ld + id_one + dest_offset], alpha, value);
}
}
}
@@ -70,8 +72,10 @@ __kernel void CopyMatrix(const int src_one, const int src_two,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
+ const __constant real* restrict arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
+ const real alpha = arg_alpha[0];
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -94,7 +98,7 @@ __kernel void CopyMatrix(const int src_one, const int src_two,
if (id_two < dest_two && id_one < dest_one) {
real value = src[id_two*src_ld + id_one + src_offset];
if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
- dest[id_two*dest_ld + id_one + dest_offset] = value;
+ Multiply(dest[id_two*dest_ld + id_one + dest_offset], alpha, value);
}
}
}
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index 08266461..d5c46a30 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -39,7 +39,9 @@ R"(
__attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
__kernel void TransposeMatrixFast(const int ld,
__global const realT* restrict src,
- __global realT* dest) {
+ __global realT* dest,
+ const __constant real* restrict arg_alpha) {
+ const real alpha = arg_alpha[0];
// Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
// way over workgroups, breaking memory-bank dependencies.
@@ -117,12 +119,50 @@ __kernel void TransposeMatrixFast(const int ld,
results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF};
#endif
- // Stores the results into the destination matrix
+ // Multiplies by alpha and then stores the results into the destination matrix
#pragma unroll
for (int w_two=0; w_two<TRA_WPT; ++w_two) {
+ realT result;
+ #if TRA_WPT == 1
+ Multiply(result, alpha, results[w_two]);
+ #elif TRA_WPT == 2
+ Multiply(result.x, alpha, results[w_two].x);
+ Multiply(result.y, alpha, results[w_two].y);
+ #elif TRA_WPT == 4
+ Multiply(result.x, alpha, results[w_two].x);
+ Multiply(result.y, alpha, results[w_two].y);
+ Multiply(result.z, alpha, results[w_two].z);
+ Multiply(result.w, alpha, results[w_two].w);
+ #elif TRA_WPT == 8
+ Multiply(result.s0, alpha, results[w_two].s0);
+ Multiply(result.s1, alpha, results[w_two].s1);
+ Multiply(result.s2, alpha, results[w_two].s2);
+ Multiply(result.s3, alpha, results[w_two].s3);
+ Multiply(result.s4, alpha, results[w_two].s4);
+ Multiply(result.s5, alpha, results[w_two].s5);
+ Multiply(result.s6, alpha, results[w_two].s6);
+ Multiply(result.s7, alpha, results[w_two].s7);
+ #elif TRA_WPT == 16
+ Multiply(result.s0, alpha, results[w_two].s0);
+ Multiply(result.s1, alpha, results[w_two].s1);
+ Multiply(result.s2, alpha, results[w_two].s2);
+ Multiply(result.s3, alpha, results[w_two].s3);
+ Multiply(result.s4, alpha, results[w_two].s4);
+ Multiply(result.s5, alpha, results[w_two].s5);
+ Multiply(result.s6, alpha, results[w_two].s6);
+ Multiply(result.s7, alpha, results[w_two].s7);
+ Multiply(result.s8, alpha, results[w_two].s8);
+ Multiply(result.s9, alpha, results[w_two].s9);
+ Multiply(result.sA, alpha, results[w_two].sA);
+ Multiply(result.sB, alpha, results[w_two].sB);
+ Multiply(result.sC, alpha, results[w_two].sC);
+ Multiply(result.sD, alpha, results[w_two].sD);
+ Multiply(result.sE, alpha, results[w_two].sE);
+ Multiply(result.sF, alpha, results[w_two].sF);
+ #endif
const int id_one = gid0*TRA_DIM + get_local_id(0);
const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
- dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
+ dest[id_two*(ld/TRA_WPT) + id_one] = result;
}
}
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 38c23346..2de0c7bd 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -31,7 +31,9 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
+ const __constant real* restrict arg_alpha,
const int do_conjugate) {
+ const real alpha = arg_alpha[0];
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -75,7 +77,7 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
- dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
}
}
}
@@ -93,8 +95,10 @@ __kernel void TransposeMatrix(const int src_one, const int src_two,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
__global real* dest,
+ const __constant real* restrict arg_alpha,
const int upper, const int lower,
const int diagonal_imag_zero) {
+ const real alpha = arg_alpha[0];
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -143,7 +147,7 @@ __kernel void TransposeMatrix(const int src_one, const int src_two,
if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
- dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
}
}
}
diff --git a/src/routine.cc b/src/routine.cc
index 4b334e60..1cf8bff8 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -302,6 +302,7 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
+ const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper, const bool lower,
@@ -339,6 +340,10 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
}
}
+ // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &alpha);
+
// Retrieves the kernel from the compiled binary
try {
auto kernel = Kernel(program, kernel_name);
@@ -348,6 +353,7 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
kernel.SetArgument(0, static_cast<int>(src_ld));
kernel.SetArgument(1, src());
kernel.SetArgument(2, dest());
+ kernel.SetArgument(3, alpha_buffer());
}
else {
kernel.SetArgument(0, static_cast<int>(src_one));
@@ -360,13 +366,14 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
kernel.SetArgument(7, static_cast<int>(dest_ld));
kernel.SetArgument(8, static_cast<int>(dest_offset));
kernel.SetArgument(9, dest());
+ kernel.SetArgument(10, alpha_buffer());
if (do_pad) {
- kernel.SetArgument(10, static_cast<int>(do_conjugate));
+ kernel.SetArgument(11, static_cast<int>(do_conjugate));
}
else {
- kernel.SetArgument(10, static_cast<int>(upper));
- kernel.SetArgument(11, static_cast<int>(lower));
- kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
+ kernel.SetArgument(11, static_cast<int>(upper));
+ kernel.SetArgument(12, static_cast<int>(lower));
+ kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
}
}
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 6fa6a811..42d5f19e 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -145,7 +145,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
- program, true, a_do_transpose, a_conjugate);
+ ConstantOne<T>(), program,
+ true, a_do_transpose, a_conjugate);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
@@ -156,7 +157,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- program, true, b_do_transpose, b_conjugate);
+ ConstantOne<T>(), program,
+ true, b_do_transpose, b_conjugate);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessB);
}
@@ -167,7 +169,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
- program, true, c_do_transpose, false);
+ ConstantOne<T>(), program,
+ true, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
}
@@ -205,7 +208,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = PadCopyTransposeMatrix(event_, eventWaitList,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
- program, false, c_do_transpose, false);
+ ConstantOne<T>(), program,
+ false, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
}
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index e83d105f..5ec1f8cd 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -132,7 +132,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
- program, true, ab_rotated, ab_conjugate);
+ ConstantOne<T>(), program,
+ true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessA1);
if (ErrorIn(status)) { return status; }
}
@@ -141,7 +142,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
- program, true, ab_rotated, !ab_conjugate);
+ ConstantOne<T>(), program,
+ true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessA2);
if (ErrorIn(status)) { return status; }
}
@@ -150,7 +152,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
- program, true, ab_rotated, ab_conjugate);
+ ConstantOne<T>(), program,
+ true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessB1);
if (ErrorIn(status)) { return status; }
}
@@ -159,7 +162,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
- program, true, ab_rotated, !ab_conjugate);
+ ConstantOne<T>(), program,
+ true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessB2);
if (ErrorIn(status)) { return status; }
}
@@ -170,7 +174,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- program, true, c_rotated, false);
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
if (ErrorIn(status)) { return status; }
@@ -222,7 +227,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- program, false, c_rotated, false, upper, lower, true);
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index 9ab50dd2..df97a94f 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -124,7 +124,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- program, true, a_rotated, a_conjugate);
+ ConstantOne<T>(), program,
+ true, a_rotated, a_conjugate);
eventWaitList.push_back(eventProcessA);
if (ErrorIn(status)) { return status; }
}
@@ -133,7 +134,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- program, true, a_rotated, b_conjugate);
+ ConstantOne<T>(), program,
+ true, a_rotated, b_conjugate);
eventWaitList.push_back(eventProcessB);
if (ErrorIn(status)) { return status; }
}
@@ -144,7 +146,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- program, true, c_rotated, false);
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
if (ErrorIn(status)) { return status; }
@@ -180,7 +183,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- program, false, c_rotated, false, upper, lower, true);
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index 49fbe64b..dd7d19fe 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -124,7 +124,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- program, true, ab_rotated, false);
+ ConstantOne<T>(), program,
+ true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
@@ -133,7 +134,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- program, true, ab_rotated, false);
+ ConstantOne<T>(), program,
+ true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessB);
}
@@ -144,7 +146,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- program, true, c_rotated, false);
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
@@ -193,7 +196,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- program, false, c_rotated, false, upper, lower, false);
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index 9913c7ca..b5817b82 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -117,7 +117,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- program, true, a_rotated, false);
+ ConstantOne<T>(), program,
+ true, a_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
@@ -128,7 +129,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- program, true, c_rotated, false);
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
@@ -164,7 +166,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
status = PadCopyTransposeMatrix(event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- program, false, c_rotated, false, upper, lower, false);
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc
new file mode 100644
index 00000000..77fc445f
--- /dev/null
+++ b/src/routines/levelx/xomatcopy.cc
@@ -0,0 +1,103 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/levelx/xomatcopy.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xomatcopy<half>::precision_ = Precision::kHalf;
+template <> const Precision Xomatcopy<float>::precision_ = Precision::kSingle;
+template <> const Precision Xomatcopy<double>::precision_ = Precision::kDouble;
+template <> const Precision Xomatcopy<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xomatcopy<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name):
+ Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, precision_) {
+ source_string_ =
+ #include "../../kernels/level3/level3.opencl"
+ #include "../../kernels/level3/copy_fast.opencl"
+ #include "../../kernels/level3/copy_pad.opencl"
+ #include "../../kernels/level3/transpose_fast.opencl"
+ #include "../../kernels/level3/transpose_pad.opencl"
+ ;
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+ // Makes sure all dimensions are larger than zero
+ if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+
+ // Determines whether to transpose the matrix A
+ const auto transpose = (a_transpose != Transpose::kNo);
+
+ // In case of complex data-types, the transpose can also become a conjugate transpose
+ const auto conjugate = (a_transpose == Transpose::kConjugate);
+
+ // Computes the dimensions of the two matrices
+ const auto rotated = (layout == Layout::kRowMajor);
+ const auto a_one = (rotated) ? n : m;
+ const auto a_two = (rotated) ? m : n;
+ const auto b_one = (transpose) ? a_two : a_one;
+ const auto b_two = (transpose) ? a_one : a_two;
+
+ // Tests the matrices for validity, first from a perspective of the OpenCL buffers and their
+ // sizes, and then from a perspective of parameter values (e.g. m, n). Tests whether the OpenCL
+ // buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage space.
+ // Also tests that the leading dimensions of:
+ // matrix A cannot be less than N when rotated, or less than M when not-rotated
+ // matrix B cannot be less than M when rotated, or less than N when not-rotated
+ auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+ status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+
+ // Loads the program from the database
+ const auto program = GetProgramFromCache();
+
+ auto emptyEventList = std::vector<Event>();
+ status = PadCopyTransposeMatrix(event_, emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ b_one, b_two, b_ld, b_offset, b_buffer,
+ alpha, program, false, transpose, conjugate);
+ if (ErrorIn(status)) { return status; }
+
+ return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xomatcopy<half>;
+template class Xomatcopy<float>;
+template class Xomatcopy<double>;
+template class Xomatcopy<float2>;
+template class Xomatcopy<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/copy_fast.cc b/src/tuning/copy_fast.cc
index 2da707be..09fdbaba 100644
--- a/src/tuning/copy_fast.cc
+++ b/src/tuning/copy_fast.cc
@@ -37,7 +37,7 @@ class TuneCopy {
}
// The list of arguments relevant for this routine
- static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+ static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) { }
@@ -86,9 +86,11 @@ class TuneCopy {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
+ auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentOutput(b_mat);
+ tuner.AddArgumentInput(alpha_buffer);
}
// Describes how to compute the performance metrics
diff --git a/src/tuning/copy_pad.cc b/src/tuning/copy_pad.cc
index ec392471..7088b3bf 100644
--- a/src/tuning/copy_pad.cc
+++ b/src/tuning/copy_pad.cc
@@ -37,7 +37,7 @@ class TunePad {
}
// The list of arguments relevant for this routine
- static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+ static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) { }
@@ -86,6 +86,7 @@ class TunePad {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
+ auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -96,6 +97,7 @@ class TunePad {
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
+ tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentScalar(0);
}
diff --git a/src/tuning/transpose_fast.cc b/src/tuning/transpose_fast.cc
index 1a5260c1..3b0bdeb5 100644
--- a/src/tuning/transpose_fast.cc
+++ b/src/tuning/transpose_fast.cc
@@ -37,7 +37,7 @@ class TuneTranspose {
}
// The list of arguments relevant for this routine
- static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+ static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) { }
@@ -91,9 +91,11 @@ class TuneTranspose {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
+ auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentOutput(b_mat);
+ tuner.AddArgumentInput(alpha_buffer);
}
// Describes how to compute the performance metrics
diff --git a/src/tuning/transpose_pad.cc b/src/tuning/transpose_pad.cc
index 08b52510..b9ab3ffa 100644
--- a/src/tuning/transpose_pad.cc
+++ b/src/tuning/transpose_pad.cc
@@ -37,7 +37,7 @@ class TunePadTranspose {
}
// The list of arguments relevant for this routine
- static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+ static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
// Tests for valid arguments
static void TestValidArguments(const Arguments<T> &) { }
@@ -90,6 +90,7 @@ class TunePadTranspose {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
+ auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -100,6 +101,7 @@ class TunePadTranspose {
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(0);
tuner.AddArgumentOutput(b_mat);
+ tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentScalar(0);
}