summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-06-16 18:07:46 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-06-16 18:07:46 +0200
commit52ccaf5b25e14c9ce032315e5e96b1f27886d481 (patch)
tree087288b7aebf2a06ffc4e7dcbcd4353f7a3be6a7 /include
parent39b7dbc5e37829abfbcfb77852b9138b31540b42 (diff)
Added XOMATCOPY routines to perform out-of-place matrix scaling, copying, and/or transposing
Diffstat (limited to 'include')
-rw-r--r--include/clblast.h13
-rw-r--r--include/clblast_c.h36
-rw-r--r--include/internal/routine.h1
-rw-r--r--include/internal/routines/levelx/xomatcopy.h54
4 files changed, 104 insertions, 0 deletions
diff --git a/include/clblast.h b/include/clblast.h
index 64b2610a..31a07423 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -560,6 +560,19 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c
cl_command_queue* queue, cl_event* event = nullptr);
// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const T alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event = nullptr);
+
+// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 40248615..3ac6d99c 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -1266,6 +1266,42 @@ StatusCode PUBLIC_API CLBlastHtrsm(const Layout layout, const Side side, const T
cl_command_queue* queue, cl_event* event);
// =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+StatusCode PUBLIC_API CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const float alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const double alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const cl_float2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const cl_double2 alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+ cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
// CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
// for the same device. This cache can be cleared to free up system memory or in case of debugging.
diff --git a/include/internal/routine.h b/include/internal/routine.h
index d420e2db..35837575 100644
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@@ -83,6 +83,7 @@ class Routine {
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
+ const T alpha,
const Program &program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
diff --git a/include/internal/routines/levelx/xomatcopy.h b/include/internal/routines/levelx/xomatcopy.h
new file mode 100644
index 00000000..38df846e
--- /dev/null
+++ b/include/internal/routines/levelx/xomatcopy.h
@@ -0,0 +1,54 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_
+#define CLBLAST_ROUTINES_XOMATCOPY_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xomatcopy: public Routine<T> {
+ public:
+
+ // Members and methods from the base class
+ using Routine<T>::source_string_;
+ using Routine<T>::event_;
+ using Routine<T>::GetProgramFromCache;
+ using Routine<T>::PadCopyTransposeMatrix;
+ using Routine<T>::TestMatrixA;
+ using Routine<T>::TestMatrixB;
+ using Routine<T>::ErrorIn;
+
+ // Constructor
+ Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");
+
+ // Templated-precision implementation of the routine
+ StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose,
+ const size_t m, const size_t n, const T alpha,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+
+ private:
+ // Static variable to get the precision
+ const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XOMATCOPY_H_
+#endif