From 2f0697564fea11bd3f91e4474d766de54ca5ac1b Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Sun, 20 Nov 2016 15:05:42 +0100 Subject: Fixed a bug in the TRMM routine caused by overwriting input data before consuming everything --- CHANGELOG | 1 + src/routines/level3/xtrmm.cpp | 19 ++++++++++++------- src/utilities/buffer_test.hpp | 12 +++++------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 48305f03..b679a435 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,7 @@ Development version (next release) - Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation - Fixed a bug in the tests and samples related to waiting for an invalid event - Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters +- Fixed a bug in the TRMM routine that would overwrite input data before consuming everything - Added support for compilation under Visual Studio 2013 (MSVC++ 12.0) - Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS - Added an option to run tuned kernels multiple times to average execution times diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp index 6bf77cfa..1c1f5f90 100644 --- a/src/routines/level3/xtrmm.cpp +++ b/src/routines/level3/xtrmm.cpp @@ -30,11 +30,11 @@ Xtrmm::Xtrmm(Queue &queue, EventPointer event, const std::string &name): // The main routine template void Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle triangle, - const Transpose a_transpose, const Diagonal diagonal, - const size_t m, const size_t n, - const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + const Transpose a_transpose, const Diagonal diagonal, + const size_t m, const size_t n, + const T alpha, + const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); } @@ -55,6 +55,11 @@ void Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle trian // Determines whether or not the triangular matrix is unit-diagonal auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false; + // Creates a copy of B to avoid overwriting input in GEMM while computing output + const auto b_one = (layout == Layout::kRowMajor) ? m : n; + auto b_buffer_copy = Buffer(context_, b_one*b_ld + b_offset); + b_buffer.CopyTo(queue_, b_one*b_ld + b_offset, b_buffer_copy); + // Temporary buffer for a copy of the triangular matrix auto temp_triangular = Buffer(context_, k*k); @@ -91,7 +96,7 @@ void Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle trian m, n, k, alpha, temp_triangular, 0, k, - b_buffer, b_offset, b_ld, + b_buffer_copy, b_offset, b_ld, static_cast(0.0), b_buffer, b_offset, b_ld); } @@ -102,7 +107,7 @@ void Xtrmm::DoTrmm(const Layout layout, const Side side, const Triangle trian DoGemm(layout, Transpose::kNo, a_transpose, m, n, k, alpha, - b_buffer, b_offset, b_ld, + b_buffer_copy, b_offset, b_ld, temp_triangular, 0, k, static_cast(0.0), b_buffer, b_offset, b_ld); diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp index 9a23e0b7..652ab8c6 100644 --- a/src/utilities/buffer_test.hpp +++ b/src/utilities/buffer_test.hpp @@ -23,7 +23,7 @@ namespace clblast { // Tests matrix 'A' for validity template void TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld) { + const size_t offset, const size_t ld) { if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); @@ -34,7 +34,7 @@ void TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, // Tests matrix 'B' for validity template void TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld) { + const size_t offset, const size_t ld) { if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); @@ -45,7 +45,7 @@ void TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, // Tests matrix 'C' for validity template void TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld) { + const size_t offset, const size_t ld) { if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimC); } try { const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T); @@ -66,8 +66,7 @@ void TestMatrixAP(const size_t n, const Buffer &buffer, const size_t offset) // Tests vector 'X' for validity template -void TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, - const size_t inc) { +void TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc) { if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementX); } try { const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); @@ -77,8 +76,7 @@ void TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, // Tests vector 'Y' for validity template -void TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, - const size_t inc) { +void TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, const size_t inc) { if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementY); } try { const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T); -- cgit v1.2.3