summary | refs | log | tree | commit | diff
path: root/src/routines/levelx/xinvert.cpp
diff options
context:
space:
mode:
author: Cedric Nugteren <web@cedricnugteren.nl> 2017-01-18 21:29:59 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2017-01-18 21:29:59 +0100
commit: df9a77d74d87fb8832264e9e9a37336001873151 (patch)
tree: 516e113140164daa0d918803dee64b94b685afb6 /src/routines/levelx/xinvert.cpp
parent: 4b3ffd998904f5c848edc5917308f5942fa71da3 (diff)
Added first version of the TRSM routine based on the diagonal invert kernel
Diffstat (limited to 'src/routines/levelx/xinvert.cpp')
-rw-r--r-- src/routines/levelx/xinvert.cpp 10
1 file changed, 6 insertions, 4 deletions
diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp
index 5ffba958..ffee9b7c 100644
--- a/src/routines/levelx/xinvert.cpp
+++ b/src/routines/levelx/xinvert.cpp
@@ -27,6 +27,7 @@ namespace clblast {
template <typename T>
Xinvert<T>::Xinvert(Queue &queue, EventPointer event, const std::string &name):
Routine(queue, event, name, {"Invert"}, PrecisionValue<T>(), {}, {
+ #include "../../kernels/level3/level3.opencl"
#include "../../kernels/level3/invert_diagonal_blocks.opencl"
}) {
}
@@ -91,8 +92,9 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
const auto local = std::vector<size_t>{internal_block_size};
const auto global = std::vector<size_t>{num_internal_blocks * internal_block_size};
auto base_kernel_event = Event();
- RunKernel(kernel, queue_, device_, global, local, base_kernel_event.pointer(), event_wait_list);
- event_wait_list.push_back(base_kernel_event);
+ auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer();
+ RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list);
+ if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); }
// Builds up block_size x block_size blocks. For example, internal_block_size=16:
// use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x npages) grid, 4 x 4 threads;
@@ -130,8 +132,8 @@ void Xinvert<T>::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle
kernel2.SetArgument(3, static_cast<int>(npages));
kernel2.SetArgument(4, static_cast<int>(block_size));
auto kernel2_event = Event();
- auto eventPointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
- RunKernel(kernel2, queue_, device_, global, local, eventPointer, event_wait_list);
+ auto kernel2_event_pointer = (is_last_kernel) ? event_ : kernel2_event.pointer();
+ RunKernel(kernel2, queue_, device_, global, local, kernel2_event_pointer, event_wait_list);
if (!is_last_kernel) { event_wait_list.push_back(kernel2_event); }
// Exit in case we reach beyond the bounds of the input matrix