summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCNugteren <web@cedricnugteren.nl>2015-06-23 08:09:07 +0200
committerCNugteren <web@cedricnugteren.nl>2015-06-23 08:09:07 +0200
commit20eb3506d63e21725974e16ae392cf0dd4bf4df5 (patch)
tree8742994285f53f94f727b1853404325103104393 /src
parent4c2a166bc5406b194108d3b31238e55ac6b99e3c (diff)
Added a condition to update only lower/upper triangular parts in the un-pad kernels
Diffstat (limited to 'src')
-rw-r--r--src/kernels/pad.opencl14
-rw-r--r--src/kernels/padtranspose.opencl18
-rw-r--r--src/routine.cc10
-rw-r--r--src/routines/xgemm.cc8
4 files changed, 36 insertions, 14 deletions
diff --git a/src/kernels/pad.opencl b/src/kernels/pad.opencl
index 45eaef91..cce0c746 100644
--- a/src/kernels/pad.opencl
+++ b/src/kernels/pad.opencl
@@ -86,7 +86,8 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
- __global real* dest) {
+ __global real* dest,
+ const int upper, const int lower) {
// Loops over the work per thread in both dimensions
#pragma unroll
@@ -95,11 +96,18 @@ __kernel void UnPadMatrix(const int src_one, const int src_two,
#pragma unroll
for (int w_two=0; w_two<PAD_WPTY; ++w_two) {
const int id_two = (get_group_id(1)*PAD_WPTY + w_two) * PAD_DIMY + get_local_id(1);
- if (id_two < dest_two && id_one < dest_one) {
+
+ // Masking in case of triangular matrices: updates only the upper or lower part
+ bool condition = true;
+ if (upper == 1) { condition = (id_two >= id_one); }
+ else if (lower == 1) { condition = (id_two <= id_one); }
+ if (condition) {
// Copies the value into the destination matrix. This is always within bounds of the source
// matrix, as we know that the destination matrix is smaller than the source.
- dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+ if (id_two < dest_two && id_one < dest_one) {
+ dest[id_two*dest_ld + id_one + dest_offset] = src[id_two*src_ld + id_one + src_offset];
+ }
}
}
}
diff --git a/src/kernels/padtranspose.opencl b/src/kernels/padtranspose.opencl
index 2f2aabd6..7e923392 100644
--- a/src/kernels/padtranspose.opencl
+++ b/src/kernels/padtranspose.opencl
@@ -100,7 +100,8 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
__global const real* restrict src,
const int dest_one, const int dest_two,
const int dest_ld, const int dest_offset,
- __global real* dest) {
+ __global real* dest,
+ const int upper, const int lower) {
// Local memory to store a tile of the matrix (for coalescing)
__local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -137,10 +138,17 @@ __kernel void UnPadTransposeMatrix(const int src_one, const int src_two,
const int id_dest_one = (get_group_id(0)*PADTRA_WPT + w_one) * PADTRA_TILE + get_local_id(0);
const int id_dest_two = (get_group_id(1)*PADTRA_WPT + w_two) * PADTRA_TILE + get_local_id(1);
- // Stores the transposed value in the destination matrix
- if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
- real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
- dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ // Masking in case of triangular matrices: updates only the upper or lower part
+ bool condition = true;
+ if (upper == 1) { condition = (id_dest_one >= id_dest_two); }
+ else if (lower == 1) { condition = (id_dest_one <= id_dest_two); }
+ if (condition) {
+
+ // Stores the transposed value in the destination matrix
+ if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
+ real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
+ dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+ }
}
}
}
diff --git a/src/routine.cc b/src/routine.cc
index a4e0bb37..4b7ece41 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -210,11 +210,13 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
const size_t dest_ld, const size_t dest_offset,
const Buffer &dest,
const bool do_transpose, const bool do_conjugate,
- const bool pad, const Program &program) {
+ const bool pad, const bool upper, const bool lower,
+ const Program &program) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
- (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld);
+ (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
+ (upper == false) && (lower == false);
// Determines the right kernel
auto kernel_name = std::string{};
@@ -267,6 +269,10 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
if (pad) {
kernel.SetArgument(10, static_cast<int>(do_conjugate));
}
+ else {
+ kernel.SetArgument(10, static_cast<int>(upper));
+ kernel.SetArgument(11, static_cast<int>(lower));
+ }
}
// Launches the kernel and returns the error code. Uses global and local thread sizes based on
diff --git a/src/routines/xgemm.cc b/src/routines/xgemm.cc
index 20cd2675..651ebb55 100644
--- a/src/routines/xgemm.cc
+++ b/src/routines/xgemm.cc
@@ -108,18 +108,18 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// them up until they reach a certain multiple of size (kernel parameter dependent).
status = PadCopyTransposeMatrix(a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, temp_a,
- a_do_transpose, a_conjugate, true, program);
+ a_do_transpose, a_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
status = PadCopyTransposeMatrix(b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, temp_b,
- b_do_transpose, b_conjugate, true, program);
+ b_do_transpose, b_conjugate, true, false, false, program);
if (ErrorIn(status)) { return status; }
// Only necessary for matrix C if it used both as input and output
if (beta != static_cast<T>(0)) {
status = PadCopyTransposeMatrix(c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
- c_do_transpose, false, true, program);
+ c_do_transpose, false, true, false, false, program);
if (ErrorIn(status)) { return status; }
}
@@ -151,7 +151,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// Runs the post-processing kernel
status = PadCopyTransposeMatrix(m_ceiled, n_ceiled, m_ceiled, 0, temp_c,
c_one, c_two, c_ld, c_offset, c_buffer,
- c_do_transpose, false, false, program);
+ c_do_transpose, false, false, false, false, program);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation