summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG | 1
-rw-r--r-- src/routines/level3/xtrmm.cpp | 19
-rw-r--r-- src/utilities/buffer_test.hpp | 12
3 files changed, 18 insertions, 14 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 48305f03..b679a435 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,7 @@ Development version (next release)
- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
- Fixed a bug in the tests and samples related to waiting for an invalid event
- Fixed a bug in the SYRK/SYR2K/HERK/HER2K routines that would occur with specific tuning parameters
+- Fixed a bug in the TRMM routine that would overwrite input data before consuming everything
- Added support for compilation under Visual Studio 2013 (MSVC++ 12.0)
- Added an option to set OpenCL compiler options through the env variable CLBLAST_BUILD_OPTIONS
- Added an option to run tuned kernels multiple times to average execution times
diff --git a/src/routines/level3/xtrmm.cpp b/src/routines/level3/xtrmm.cpp
index 6bf77cfa..1c1f5f90 100644
--- a/src/routines/level3/xtrmm.cpp
+++ b/src/routines/level3/xtrmm.cpp
@@ -30,11 +30,11 @@ Xtrmm<T>::Xtrmm(Queue &queue, EventPointer event, const std::string &name):
// The main routine
template <typename T>
void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle triangle,
- const Transpose a_transpose, const Diagonal diagonal,
- const size_t m, const size_t n,
- const T alpha,
- const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+ const Transpose a_transpose, const Diagonal diagonal,
+ const size_t m, const size_t n,
+ const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { throw BLASError(StatusCode::kInvalidDimension); }
@@ -55,6 +55,11 @@ void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle trian
// Determines whether or not the triangular matrix is unit-diagonal
auto unit_diagonal = (diagonal == Diagonal::kUnit) ? true : false;
+ // Creates a copy of B to avoid overwriting input in GEMM while computing output
+ const auto b_one = (layout == Layout::kRowMajor) ? m : n;
+ auto b_buffer_copy = Buffer<T>(context_, b_one*b_ld + b_offset);
+ b_buffer.CopyTo(queue_, b_one*b_ld + b_offset, b_buffer_copy);
+
// Temporary buffer for a copy of the triangular matrix
auto temp_triangular = Buffer<T>(context_, k*k);
@@ -91,7 +96,7 @@ void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle trian
m, n, k,
alpha,
temp_triangular, 0, k,
- b_buffer, b_offset, b_ld,
+ b_buffer_copy, b_offset, b_ld,
static_cast<T>(0.0),
b_buffer, b_offset, b_ld);
}
@@ -102,7 +107,7 @@ void Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle trian
DoGemm(layout, Transpose::kNo, a_transpose,
m, n, k,
alpha,
- b_buffer, b_offset, b_ld,
+ b_buffer_copy, b_offset, b_ld,
temp_triangular, 0, k,
static_cast<T>(0.0),
b_buffer, b_offset, b_ld);
diff --git a/src/utilities/buffer_test.hpp b/src/utilities/buffer_test.hpp
index 9a23e0b7..652ab8c6 100644
--- a/src/utilities/buffer_test.hpp
+++ b/src/utilities/buffer_test.hpp
@@ -23,7 +23,7 @@ namespace clblast {
// Tests matrix 'A' for validity
template <typename T>
void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
+ const size_t offset, const size_t ld) {
if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimA); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
@@ -34,7 +34,7 @@ void TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
// Tests matrix 'B' for validity
template <typename T>
void TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
+ const size_t offset, const size_t ld) {
if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimB); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
@@ -45,7 +45,7 @@ void TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
// Tests matrix 'C' for validity
template <typename T>
void TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
- const size_t offset, const size_t ld) {
+ const size_t offset, const size_t ld) {
if (ld < one) { throw BLASError(StatusCode::kInvalidLeadDimC); }
try {
const auto required_size = (ld * (two - 1) + one + offset) * sizeof(T);
@@ -66,8 +66,7 @@ void TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset)
// Tests vector 'X' for validity
template <typename T>
-void TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
- const size_t inc) {
+void TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t inc) {
if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementX); }
try {
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);
@@ -77,8 +76,7 @@ void TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
// Tests vector 'Y' for validity
template <typename T>
-void TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
- const size_t inc) {
+void TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t inc) {
if (inc == 0) { throw BLASError(StatusCode::kInvalidIncrementY); }
try {
const auto required_size = ((n - 1) * inc + 1 + offset) * sizeof(T);