From 5702bff5ad579466397f6537dc8925ebd64e3ba3 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 31 May 2018 22:37:06 +0200 Subject: Added error-checking for half-empty local work group sizes; fixed a minor TRSV global worksize issue --- src/routines/common.cpp | 9 +++++++++ src/routines/level2/xtrsv.cpp | 2 +- src/routines/levelx/xinvert.cpp | 6 +++--- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/routines/common.cpp b/src/routines/common.cpp index d3c402bd..695785c4 100644 --- a/src/routines/common.cpp +++ b/src/routines/common.cpp @@ -13,6 +13,7 @@ #include #include +#include #include "routines/common.hpp" @@ -46,6 +47,14 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device, for (auto i=size_t{0}; i::Substitution(const Layout layout, const Triangle triangle, // Launches the kernel const auto local = std::vector{db_["TRSV_BLOCK_SIZE"]}; - const auto global = std::vector{1}; + const auto global = std::vector{Ceil(n, db_["TRSV_BLOCK_SIZE"])}; auto event = Event(); RunKernel(kernel, queue_, device_, global, local, event.pointer()); event.WaitForCompletion(); diff --git a/src/routines/levelx/xinvert.cpp b/src/routines/levelx/xinvert.cpp index 09ef3ec1..99f196ec 100644 --- a/src/routines/levelx/xinvert.cpp +++ b/src/routines/levelx/xinvert.cpp @@ -95,11 +95,11 @@ void Xinvert::InvertMatrixDiagonalBlocks(const Layout layout, const Triangle kernel.SetArgument(5, static_cast(block_size)); kernel.SetArgument(6, static_cast(unit_diagonal)); kernel.SetArgument(7, static_cast(is_upper)); - const auto local = std::vector{internal_block_size}; - const auto global = std::vector{num_internal_blocks * internal_block_size}; + const auto local_invert = std::vector{internal_block_size}; + const auto global_invert = std::vector{num_internal_blocks * internal_block_size}; auto base_kernel_event = Event(); auto base_kernel_event_pointer = (internal_block_size == block_size) ? event_ : base_kernel_event.pointer(); - RunKernel(kernel, queue_, device_, global, local, base_kernel_event_pointer, event_wait_list); + RunKernel(kernel, queue_, device_, global_invert, local_invert, base_kernel_event_pointer, event_wait_list); if (internal_block_size == block_size) { event_wait_list.push_back(base_kernel_event); } // Builds up block_size x block_size blocks. For example, internal_block_size=16: -- cgit v1.2.3