summary refs log tree commit diff
path: root/src/routines
diff options
context:
space:
mode:
Diffstat (limited to 'src/routines')
-rw-r--r-- src/routines/level3/xgemm.cc 12
-rw-r--r-- src/routines/level3/xher2k.cc 18
-rw-r--r-- src/routines/level3/xherk.cc 12
-rw-r--r-- src/routines/level3/xsyr2k.cc 12
-rw-r--r-- src/routines/level3/xsyrk.cc 9
-rw-r--r-- src/routines/levelx/xomatcopy.cc 103
6 files changed, 145 insertions, 21 deletions
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 6fa6a811..42d5f19e 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -145,7 +145,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
- program, true, a_do_transpose, a_conjugate);
+ ConstantOne<T>(), program,
+ true, a_do_transpose, a_conjugate);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
@@ -156,7 +157,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
b_one, b_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- program, true, b_do_transpose, b_conjugate);
+ ConstantOne<T>(), program,
+ true, b_do_transpose, b_conjugate);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessB);
}
@@ -167,7 +169,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
c_one, c_two, c_ld, c_offset, c_buffer,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
- program, true, c_do_transpose, false);
+ ConstantOne<T>(), program,
+ true, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
}
@@ -205,7 +208,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
status = PadCopyTransposeMatrix(event_, eventWaitList,
m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
c_one, c_two, c_ld, c_offset, c_buffer,
- program, false, c_do_transpose, false);
+ ConstantOne<T>(), program,
+ false, c_do_transpose, false);
if (ErrorIn(status)) { return status; }
}
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index e83d105f..5ec1f8cd 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -132,7 +132,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
- program, true, ab_rotated, ab_conjugate);
+ ConstantOne<T>(), program,
+ true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessA1);
if (ErrorIn(status)) { return status; }
}
@@ -141,7 +142,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
- program, true, ab_rotated, !ab_conjugate);
+ ConstantOne<T>(), program,
+ true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessA2);
if (ErrorIn(status)) { return status; }
}
@@ -150,7 +152,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
- program, true, ab_rotated, ab_conjugate);
+ ConstantOne<T>(), program,
+ true, ab_rotated, ab_conjugate);
eventWaitList.push_back(eventProcessB1);
if (ErrorIn(status)) { return status; }
}
@@ -159,7 +162,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
- program, true, ab_rotated, !ab_conjugate);
+ ConstantOne<T>(), program,
+ true, ab_rotated, !ab_conjugate);
eventWaitList.push_back(eventProcessB2);
if (ErrorIn(status)) { return status; }
}
@@ -170,7 +174,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- program, true, c_rotated, false);
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
if (ErrorIn(status)) { return status; }
@@ -222,7 +227,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
status = PadCopyTransposeMatrix(event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- program, false, c_rotated, false, upper, lower, true);
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index 9ab50dd2..df97a94f 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -124,7 +124,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- program, true, a_rotated, a_conjugate);
+ ConstantOne<T>(), program,
+ true, a_rotated, a_conjugate);
eventWaitList.push_back(eventProcessA);
if (ErrorIn(status)) { return status; }
}
@@ -133,7 +134,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- program, true, a_rotated, b_conjugate);
+ ConstantOne<T>(), program,
+ true, a_rotated, b_conjugate);
eventWaitList.push_back(eventProcessB);
if (ErrorIn(status)) { return status; }
}
@@ -144,7 +146,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- program, true, c_rotated, false);
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
eventWaitList.push_back(eventProcessC);
if (ErrorIn(status)) { return status; }
@@ -180,7 +183,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- program, false, c_rotated, false, upper, lower, true);
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, true);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index 49fbe64b..dd7d19fe 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -124,7 +124,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
ab_one, ab_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- program, true, ab_rotated, false);
+ ConstantOne<T>(), program,
+ true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
@@ -133,7 +134,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
ab_one, ab_two, b_ld, b_offset, b_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
- program, true, ab_rotated, false);
+ ConstantOne<T>(), program,
+ true, ab_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessB);
}
@@ -144,7 +146,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- program, true, c_rotated, false);
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
@@ -193,7 +196,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
status = PadCopyTransposeMatrix(event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- program, false, c_rotated, false, upper, lower, false);
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
// Successfully finished the computation
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index 9913c7ca..b5817b82 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -117,7 +117,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
a_one, a_two, a_ld, a_offset, a_buffer,
n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
- program, true, a_rotated, false);
+ ConstantOne<T>(), program,
+ true, a_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessA);
}
@@ -128,7 +129,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
n, n, c_ld, c_offset, c_buffer,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
- program, true, c_rotated, false);
+ ConstantOne<T>(), program,
+ true, c_rotated, false);
if (ErrorIn(status)) { return status; }
eventWaitList.push_back(eventProcessC);
@@ -164,7 +166,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
status = PadCopyTransposeMatrix(event_, eventWaitList,
n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
n, n, c_ld, c_offset, c_buffer,
- program, false, c_rotated, false, upper, lower, false);
+ ConstantOne<T>(), program,
+ false, c_rotated, false, upper, lower, false);
if (ErrorIn(status)) { return status; }
diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc
new file mode 100644
index 00000000..77fc445f
--- /dev/null
+++ b/src/routines/levelx/xomatcopy.cc
@@ -0,0 +1,103 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/levelx/xomatcopy.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specializations of precision_: selects the Precision value for each supported template type
+template <> const Precision Xomatcopy<half>::precision_ = Precision::kHalf;
+template <> const Precision Xomatcopy<float>::precision_ = Precision::kSingle;
+template <> const Precision Xomatcopy<double>::precision_ = Precision::kDouble;
+template <> const Precision Xomatcopy<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xomatcopy<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to the base class and fills source_string_ from the included kernels
+template <typename T>
+Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name):
+ Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, precision_) {
+ source_string_ =
+ #include "../../kernels/level3/level3.opencl"
+ #include "../../kernels/level3/copy_fast.opencl"
+ #include "../../kernels/level3/copy_pad.opencl"
+ #include "../../kernels/level3/transpose_fast.opencl"
+ #include "../../kernels/level3/transpose_pad.opencl"
+ ; // terminates the assignment whose right-hand side is the text of the included files
+}
+
+// =================================================================================================
+
+// The main routine: validates the arguments, then launches PadCopyTransposeMatrix on A and B
+template <typename T>
+StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+ // Rejects empty problems: returns kInvalidDimension when either m or n is zero
+ if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+
+ // Determines whether to transpose the matrix A
+ const auto transpose = (a_transpose != Transpose::kNo);
+
+ // In case of complex data-types, the transpose can also become a conjugate transpose
+ const auto conjugate = (a_transpose == Transpose::kConjugate);
+
+ // Computes the dimensions of the two matrices: row-major swaps m and n; B's swap when transposed
+ const auto rotated = (layout == Layout::kRowMajor);
+ const auto a_one = (rotated) ? n : m;
+ const auto a_two = (rotated) ? m : n;
+ const auto b_one = (transpose) ? a_two : a_one;
+ const auto b_two = (transpose) ? a_one : a_two;
+
+ // Tests the matrices for validity, first from a perspective of the OpenCL buffers and their
+ // sizes, and then from a perspective of parameter values (e.g. m, n). Tests whether the OpenCL
+ // buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage space.
+ // Also tests that the leading dimensions of:
+ // matrix A cannot be less than N when rotated, or less than M when not-rotated
+ // matrix B uses the same bound as A when not transposed, and the swapped one (M vs. N) otherwise
+ auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+ status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+ if (ErrorIn(status)) { return status; }
+
+ // Retrieves the program through GetProgramFromCache()
+ const auto program = GetProgramFromCache();
+
+ auto emptyEventList = std::vector<Event>(); // no events are passed as the wait-list
+ status = PadCopyTransposeMatrix(event_, emptyEventList,
+ a_one, a_two, a_ld, a_offset, a_buffer,
+ b_one, b_two, b_ld, b_offset, b_buffer,
+ alpha, program, false, transpose, conjugate);
+ if (ErrorIn(status)) { return status; }
+
+ return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Compiles the templated class for each of the five supported data-types
+template class Xomatcopy<half>;
+template class Xomatcopy<float>;
+template class Xomatcopy<double>;
+template class Xomatcopy<float2>;
+template class Xomatcopy<double2>;
+
+// =================================================================================================
+} // namespace clblast