Added XOMATCOPY routines to perform out-of-place matrix scaling, copying, and/or transposing

author: Cedric Nugteren <web@cedricnugteren.nl> 2016-06-16 18:07:46 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2016-06-16 18:07:46 +0200
commit: 52ccaf5b25e14c9ce032315e5e96b1f27886d481 (patch)
tree: 087288b7aebf2a06ffc4e7dcbcd4353f7a3be6a7
parent: 39b7dbc5e37829abfbcfb77852b9138b31540b42 (diff)
33 files changed, 889 insertions, 64 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 7db4d724..e9063b91 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,6 +11,8 @@ Development version (next release)
   * Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN
   * Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2
   * Level-3: HGEMM/HSYMM/HSYRK/HSYR2K/HTRMM
+- Added non-BLAS routines:
+  * SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY (matrix copy, scaling, and/or transpose)
 
 Version 0.7.1
 - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66547cc0..21982f39 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,7 +133,8 @@ set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
 set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
                     xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
 set(LEVEL3_ROUTINES xgemm xsymm xhemm xsyrk xherk xsyr2k xher2k xtrmm)
-set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES})
+set(LEVELX_ROUTINES xomatcopy)
+set(ROUTINES ${LEVEL1_ROUTINES} ${LEVEL2_ROUTINES} ${LEVEL3_ROUTINES} ${LEVELX_ROUTINES})
 set(PRECISIONS 32 64 3232 6464)
 
 # ==================================================================================================
@@ -150,6 +151,9 @@ endforeach()
 foreach(ROUTINE ${LEVEL3_ROUTINES})
   set(SOURCES ${SOURCES} src/routines/level3/${ROUTINE}.cc)
 endforeach()
+foreach(ROUTINE ${LEVELX_ROUTINES})
+  set(SOURCES ${SOURCES} src/routines/levelx/${ROUTINE}.cc)
+endforeach()
 
 # Creates and links the library
 add_library(clblast SHARED ${SOURCES})
@@ -279,6 +283,10 @@ if(CLIENTS)
     add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
                    test/performance/routines/level3/${ROUTINE}.cc)
   endforeach()
+  foreach(ROUTINE ${LEVELX_ROUTINES})
+    add_executable(clblast_client_${ROUTINE} $<TARGET_OBJECTS:test_performance_common>
+                   test/performance/routines/levelx/${ROUTINE}.cc)
+  endforeach()
   foreach(ROUTINE ${ROUTINES})
     target_link_libraries(clblast_client_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
     install(TARGETS clblast_client_${ROUTINE} DESTINATION bin)
@@ -310,6 +318,10 @@ if(TESTS)
     add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
                    test/correctness/routines/level3/${ROUTINE}.cc)
   endforeach()
+  foreach(ROUTINE ${LEVELX_ROUTINES})
+    add_executable(clblast_test_${ROUTINE} $<TARGET_OBJECTS:test_correctness_common>
+                   test/correctness/routines/levelx/${ROUTINE}.cc)
+  endforeach()
   foreach(ROUTINE ${ROUTINES})
     target_link_libraries(clblast_test_${ROUTINE} clblast ${REF_LIBRARIES} ${OPENCL_LIBRARIES})
     install(TARGETS clblast_test_${ROUTINE} DESTINATION bin)
diff --git a/README.md b/README.md
index 926bc021..26dfb149 100644
--- a/README.md
+++ b/README.md
@@ -235,13 +235,14 @@ CLBlast is in active development but already supports almost all the BLAS routin
 | xHER2K   | - | - | ✔ | ✔ | - |
 | xTRMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
 
-In addition, some non-BLAS routines are also supported by CLBlast. They are experimental and should be used with care:
+In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
 
-| Additional | S | D | C | Z | H |
+| Level-X    | S | D | C | Z | H |
 | -----------|---|---|---|---|---|
 | xSUM       | ✔ | ✔ | ✔ | ✔ | ✔ |
 | IxMAX      | ✔ | ✔ | ✔ | ✔ | ✔ |
 | IxMIN      | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xOMATCOPY  | ✔ | ✔ | ✔ | ✔ | ✔ |
 
 Some BLAS routines are not supported yet by CLBlast. They are shown in the following table:
 
diff --git a/doc/clblast.md b/doc/clblast.md
index 5f337dd3..5105d023 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -2708,3 +2708,76 @@ Requirements for TRMM:
 
 
 
+xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function)
+-------------
+
+Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.
+
+C++ API:
+```
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                    cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const float alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const double alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_float2 alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_double2 alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_half alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to OMATCOPY:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
+* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for OMATCOPY:
+
+* The value of `a_ld` must be at least `m`.
+* The value of `b_ld` must be at least `n`.
+
+
+
diff --git a/include/clblast.h b/include/clblast.h
index 64b2610a..31a07423 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -560,6 +560,19 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c
                 cl_command_queue* queue, cl_event* event = nullptr);
 
 // =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                    cl_command_queue* queue, cl_event* event = nullptr);
+
+// =================================================================================================
 
 // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
 // for the same device. This cache can be cleared to free up system memory or in case of debugging.
diff --git a/include/clblast_c.h b/include/clblast_c.h
index 40248615..3ac6d99c 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -1266,6 +1266,42 @@ StatusCode PUBLIC_API CLBlastHtrsm(const Layout layout, const Side side, const T
                                    cl_command_queue* queue, cl_event* event);
 
 // =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+StatusCode PUBLIC_API CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const float alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const double alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const cl_float2 alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const cl_double2 alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
+                                       const size_t m, const size_t n,
+                                       const cl_half alpha,
+                                       const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                       cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                       cl_command_queue* queue, cl_event* event);
+
+// =================================================================================================
 
 // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on
 // for the same device. This cache can be cleared to free up system memory or in case of debugging.
diff --git a/include/internal/routine.h b/include/internal/routine.h
index d420e2db..35837575 100644
--- a/include/internal/routine.h
+++ b/include/internal/routine.h
@@ -83,6 +83,7 @@ class Routine {
                                     const size_t dest_one, const size_t dest_two,
                                     const size_t dest_ld, const size_t dest_offset,
                                     const Buffer<T> &dest,
+                                    const T alpha,
                                     const Program &program, const bool do_pad,
                                     const bool do_transpose, const bool do_conjugate,
                                     const bool upper = false, const bool lower = false,
diff --git a/include/internal/routines/levelx/xomatcopy.h b/include/internal/routines/levelx/xomatcopy.h
new file mode 100644
index 00000000..38df846e
--- /dev/null
+++ b/include/internal/routines/levelx/xomatcopy.h
@@ -0,0 +1,54 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy routine. The precision is implemented using a template argument.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_XOMATCOPY_H_
+#define CLBLAST_ROUTINES_XOMATCOPY_H_
+
+#include "internal/routine.h"
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class Xomatcopy: public Routine<T> {
+ public:
+
+  // Members and methods from the base class
+  using Routine<T>::source_string_;
+  using Routine<T>::event_;
+  using Routine<T>::GetProgramFromCache;
+  using Routine<T>::PadCopyTransposeMatrix;
+  using Routine<T>::TestMatrixA;
+  using Routine<T>::TestMatrixB;
+  using Routine<T>::ErrorIn;
+
+  // Constructor
+  Xomatcopy(Queue &queue, EventPointer event, const std::string &name = "OMATCOPY");
+
+  // Templated-precision implementation of the routine
+  StatusCode DoOmatcopy(const Layout layout, const Transpose a_transpose,
+                        const size_t m, const size_t n, const T alpha,
+                        const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                        const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld);
+
+ private:
+  // Static variable to get the precision
+  const static Precision precision_;
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_ROUTINES_XOMATCOPY_H_
+#endif
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 7bb66749..6726adda 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -68,6 +68,7 @@ ald_transa_m_k = "When `transpose_a == Transpose::kNo`, then `a_ld` must be at l
 ald_trans_n_k = "When `transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`."
 ald_side_m_n = "When `side = Side::kLeft` then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`."
 bld_m = "The value of `b_ld` must be at least `m`."
+bld_n = "The value of `b_ld` must be at least `n`."
 bld_transb_k_n = "When `transpose_b == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`."
 bld_trans_n_k = "When `transpose == Transpose::kNo`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`."
 cld_m = "The value of `c_ld` must be at least `m`."
@@ -134,6 +135,9 @@ routines = [
   Routine(True,  True,  "3", "her2k", TU, [Ccs,Zzd],   ["n","k"], ["layout","triangle","ab_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "Rank-2K update of a hermitian matrix", "Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.", [ald_trans_n_k, bld_trans_n_k, cld_n]),
   Routine(True,  True,  "3", "trmm",  T,  [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Triangular matrix-matrix multiplication", "Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.", [ald_side_m_n, bld_m]),
   Routine(False, True,  "3", "trsm",  T,  [S,D,C,Z,H], ["m","n"], ["layout","side","triangle","a_transpose","diagonal"], ["a"], ["b"], ["alpha"], "", "Solves a triangular system of equations", "", []),
+],
+[ # Level X: extra routines (not part of BLAS)
+  Routine(True,  True,  "x", "omatcopy", T,  [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
 ]]
 
 # ==================================================================================================
@@ -148,6 +152,7 @@ def PrecisionToFullName(x):
 	}[x]
 
 # ==================================================================================================
+
 # Separators for the BLAS levels
 separators = ["""
 // =================================================================================================
@@ -160,8 +165,15 @@ separators = ["""
 """
 // =================================================================================================
 // BLAS level-3 (matrix-matrix) routines
+// =================================================================================================""",
+"""
+// =================================================================================================
+// Extra non-BLAS routines (level-X)
 // ================================================================================================="""]
 
+# Names of the level sub-folders
+levelnames = ["1", "2", "3", "x"]
+
 # Main header/footer for source files
 header = """
 // =================================================================================================
@@ -373,7 +385,7 @@ files = [
   path_clblast+"/test/wrapper_clblas.h",
   path_clblast+"/test/wrapper_cblas.h",
 ]
-header_lines = [84, 71, 93, 22, 29, 41]
+header_lines = [84, 74, 93, 22, 29, 41]
 footer_lines = [17, 71, 19, 14, 6, 6]
 
 # Checks whether the command-line arguments are valid; exists otherwise
@@ -396,7 +408,8 @@ for i in xrange(0,len(files)):
 	# Re-writes the body of the file
 	with open(files[i], "w") as f:
 		body = ""
-		for level in [1,2,3]:
+		levels = [1,2,3] if (i == 4 or i == 5) else [1,2,3,4]
+		for level in levels:
 			body += separators[level-1]+"\n"
 			if i == 0:
 				body += clblast_h(routines[level-1])
@@ -417,14 +430,14 @@ for i in xrange(0,len(files)):
 # ==================================================================================================
 
 # Outputs all the correctness-test implementations
-for level in [1,2,3]:
+for level in [1,2,3,4]:
 	for routine in routines[level-1]:
 		if routine.has_tests:
-			filename = path_clblast+"/test/correctness/routines/level"+str(level)+"/x"+routine.name+".cc"
+			filename = path_clblast+"/test/correctness/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc"
 			with open(filename, "w") as f:
 				body = ""
 				body += "#include \"correctness/testblas.h\"\n"
-				body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
+				body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n"
 				body += "// Shortcuts to the clblast namespace\n"
 				body += "using float2 = clblast::float2;\n"
 				body += "using double2 = clblast::double2;\n\n"
@@ -443,14 +456,14 @@ for level in [1,2,3]:
 				f.write(footer)
 
 # Outputs all the performance-test implementations
-for level in [1,2,3]:
+for level in [1,2,3,4]:
 	for routine in routines[level-1]:
 		if routine.has_tests:
-			filename = path_clblast+"/test/performance/routines/level"+str(level)+"/x"+routine.name+".cc"
+			filename = path_clblast+"/test/performance/routines/level"+levelnames[level-1]+"/x"+routine.name+".cc"
 			with open(filename, "w") as f:
 				body = ""
 				body += "#include \"performance/client.h\"\n"
-				body += "#include \"routines/level"+str(level)+"/x"+routine.name+".h\"\n\n"
+				body += "#include \"routines/level"+levelnames[level-1]+"/x"+routine.name+".h\"\n\n"
 				body += "// Shortcuts to the clblast namespace\n"
 				body += "using float2 = clblast::float2;\n"
 				body += "using double2 = clblast::double2;\n\n"
@@ -487,7 +500,7 @@ with open(filename, "w") as f:
 	f.write("\n\n")
 
 	# Loops over the routines
-	for level in [1,2,3]:
+	for level in [1,2,3,4]:
 		for routine in routines[level-1]:
 			if routine.implemented:
 
diff --git a/src/clblast.cc b/src/clblast.cc
index 07322327..e3df6ede 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -68,6 +68,9 @@
 #include "internal/routines/level3/xher2k.h"
 #include "internal/routines/level3/xtrmm.h"
 
+// Extra includes (level-x)
+#include "internal/routines/levelx/xomatcopy.h"
+
 namespace clblast {
 
 // =================================================================================================
@@ -2062,6 +2065,59 @@ template StatusCode PUBLIC_API Trsm<half>(const Layout, const Side, const Triang
                                           cl_command_queue*, cl_event*);
 
 // =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                    cl_command_queue* queue, cl_event* event) {
+  auto queue_cpp = Queue(*queue);
+  auto routine = Xomatcopy<T>(queue_cpp, event);
+  auto status = routine.SetUp();
+  if (status != StatusCode::kSuccess) { return status; }
+  return routine.DoOmatcopy(layout, a_transpose,
+                            m, n,
+                            alpha,
+                            Buffer<T>(a_buffer), a_offset, a_ld,
+                            Buffer<T>(b_buffer), b_offset, b_ld);
+}
+template StatusCode PUBLIC_API Omatcopy<float>(const Layout, const Transpose,
+                                               const size_t, const size_t,
+                                               const float,
+                                               const cl_mem, const size_t, const size_t,
+                                               cl_mem, const size_t, const size_t,
+                                               cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Omatcopy<double>(const Layout, const Transpose,
+                                                const size_t, const size_t,
+                                                const double,
+                                                const cl_mem, const size_t, const size_t,
+                                                cl_mem, const size_t, const size_t,
+                                                cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Omatcopy<float2>(const Layout, const Transpose,
+                                                const size_t, const size_t,
+                                                const float2,
+                                                const cl_mem, const size_t, const size_t,
+                                                cl_mem, const size_t, const size_t,
+                                                cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Omatcopy<double2>(const Layout, const Transpose,
+                                                 const size_t, const size_t,
+                                                 const double2,
+                                                 const cl_mem, const size_t, const size_t,
+                                                 cl_mem, const size_t, const size_t,
+                                                 cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Omatcopy<half>(const Layout, const Transpose,
+                                              const size_t, const size_t,
+                                              const half,
+                                              const cl_mem, const size_t, const size_t,
+                                              cl_mem, const size_t, const size_t,
+                                              cl_command_queue*, cl_event*);
+
+// =================================================================================================
 
 // Clears the cache of stored binaries
 StatusCode ClearCache() { return cache::ClearCache(); }
diff --git a/src/clblast_c.cc b/src/clblast_c.cc
index 2aac907a..22cb2192 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cc
@@ -2832,6 +2832,87 @@ StatusCode CLBlastHtrsm(const Layout layout, const Side side, const Triangle tri
 }
 
 // =================================================================================================
+// Extra non-BLAS routines (level-X)
+// =================================================================================================
+
+// OMATCOPY
+StatusCode CLBlastSomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const float alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                                  static_cast<clblast::Transpose>(a_transpose),
+                                  m, n,
+                                  alpha,
+                                  a_buffer, a_offset, a_ld,
+                                  b_buffer, b_offset, b_ld,
+                                  queue, event);
+  return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastDomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const double alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                                  static_cast<clblast::Transpose>(a_transpose),
+                                  m, n,
+                                  alpha,
+                                  a_buffer, a_offset, a_ld,
+                                  b_buffer, b_offset, b_ld,
+                                  queue, event);
+  return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastComatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_float2 alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                                  static_cast<clblast::Transpose>(a_transpose),
+                                  m, n,
+                                  float2{alpha.s[0], alpha.s[1]},
+                                  a_buffer, a_offset, a_ld,
+                                  b_buffer, b_offset, b_ld,
+                                  queue, event);
+  return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastZomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_double2 alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                                  static_cast<clblast::Transpose>(a_transpose),
+                                  m, n,
+                                  double2{alpha.s[0], alpha.s[1]},
+                                  a_buffer, a_offset, a_ld,
+                                  b_buffer, b_offset, b_ld,
+                                  queue, event);
+  return static_cast<StatusCode>(status);
+}
+StatusCode CLBlastHomatcopy(const Layout layout, const Transpose a_transpose,
+                            const size_t m, const size_t n,
+                            const cl_half alpha,
+                            const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                            cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                            cl_command_queue* queue, cl_event* event) {
+  auto status = clblast::Omatcopy(static_cast<clblast::Layout>(layout),
+                                  static_cast<clblast::Transpose>(a_transpose),
+                                  m, n,
+                                  alpha,
+                                  a_buffer, a_offset, a_ld,
+                                  b_buffer, b_offset, b_ld,
+                                  queue, event);
+  return static_cast<StatusCode>(status);
+}
+
+// =================================================================================================
 
 // Clears the cache of stored binaries
 StatusCode CLBlastClearCache() {
diff --git a/src/kernels/level3/copy_fast.opencl b/src/kernels/level3/copy_fast.opencl
index bfbfacd4..09e54e6d 100644
--- a/src/kernels/level3/copy_fast.opencl
+++ b/src/kernels/level3/copy_fast.opencl
@@ -38,13 +38,53 @@ R"(
 __attribute__((reqd_work_group_size(COPY_DIMX, COPY_DIMY, 1)))
 __kernel void CopyMatrixFast(const int ld,
                              __global const realC* restrict src,
-                             __global realC* dest) {
+                             __global realC* dest,
+                             const __constant real* restrict arg_alpha) {
+  const real alpha = arg_alpha[0];
   #pragma unroll
   for (int w_one=0; w_one<COPY_WPT; ++w_one) {
     const int id_one = get_global_id(0);
     const int id_two = (get_group_id(1)*COPY_WPT + w_one) * COPY_DIMY + get_local_id(1);
     const int id = id_two*(ld/COPY_VW) + id_one;
-    dest[id] = src[id];
+    realC result;
+    #if COPY_VW == 1
+      Multiply(result, alpha, src[id]);
+    #elif COPY_VW == 2
+      Multiply(result.x, alpha, src[id].x);
+      Multiply(result.y, alpha, src[id].y);
+    #elif COPY_VW == 4
+      Multiply(result.x, alpha, src[id].x);
+      Multiply(result.y, alpha, src[id].y);
+      Multiply(result.z, alpha, src[id].z);
+      Multiply(result.w, alpha, src[id].w);
+    #elif COPY_VW == 8
+      Multiply(result.s0, alpha, src[id].s0);
+      Multiply(result.s1, alpha, src[id].s1);
+      Multiply(result.s2, alpha, src[id].s2);
+      Multiply(result.s3, alpha, src[id].s3);
+      Multiply(result.s4, alpha, src[id].s4);
+      Multiply(result.s5, alpha, src[id].s5);
+      Multiply(result.s6, alpha, src[id].s6);
+      Multiply(result.s7, alpha, src[id].s7);
+    #elif COPY_VW == 16
+      Multiply(result.s0, alpha, src[id].s0);
+      Multiply(result.s1, alpha, src[id].s1);
+      Multiply(result.s2, alpha, src[id].s2);
+      Multiply(result.s3, alpha, src[id].s3);
+      Multiply(result.s4, alpha, src[id].s4);
+      Multiply(result.s5, alpha, src[id].s5);
+      Multiply(result.s6, alpha, src[id].s6);
+      Multiply(result.s7, alpha, src[id].s7);
+      Multiply(result.s8, alpha, src[id].s8);
+      Multiply(result.s9, alpha, src[id].s9);
+      Multiply(result.sA, alpha, src[id].sA);
+      Multiply(result.sB, alpha, src[id].sB);
+      Multiply(result.sC, alpha, src[id].sC);
+      Multiply(result.sD, alpha, src[id].sD);
+      Multiply(result.sE, alpha, src[id].sE);
+      Multiply(result.sF, alpha, src[id].sF);
+    #endif
+    dest[id] = result;;
   }
 }
 
diff --git a/src/kernels/level3/copy_pad.opencl b/src/kernels/level3/copy_pad.opencl
index f211af0f..d276cc60 100644
--- a/src/kernels/level3/copy_pad.opencl
+++ b/src/kernels/level3/copy_pad.opencl
@@ -31,7 +31,9 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
                             const int dest_one, const int dest_two,
                             const int dest_ld, const int dest_offset,
                             __global real* dest,
+                            const __constant real* restrict arg_alpha,
                             const int do_conjugate) {
+  const real alpha = arg_alpha[0];
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -52,7 +54,7 @@ __kernel void CopyPadMatrix(const int src_one, const int src_two,
 
         // Stores the value in the destination matrix
         if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-        dest[id_two*dest_ld + id_one + dest_offset] = value;
+        Multiply(dest[id_two*dest_ld + id_one + dest_offset], alpha, value);
       }
     }
   }
@@ -70,8 +72,10 @@ __kernel void CopyMatrix(const int src_one, const int src_two,
                          const int dest_one, const int dest_two,
                          const int dest_ld, const int dest_offset,
                          __global real* dest,
+                         const __constant real* restrict arg_alpha,
                          const int upper, const int lower,
                          const int diagonal_imag_zero) {
+  const real alpha = arg_alpha[0];
 
   // Loops over the work per thread in both dimensions
   #pragma unroll
@@ -94,7 +98,7 @@ __kernel void CopyMatrix(const int src_one, const int src_two,
         if (id_two < dest_two && id_one < dest_one) {
           real value = src[id_two*src_ld + id_one + src_offset];
           if (diagonal_imag_zero == 1 && id_one == id_two) { ImagToZero(value); }
-          dest[id_two*dest_ld + id_one + dest_offset] = value;
+          Multiply(dest[id_two*dest_ld + id_one + dest_offset], alpha, value);
         }
       }
     }
diff --git a/src/kernels/level3/transpose_fast.opencl b/src/kernels/level3/transpose_fast.opencl
index 08266461..d5c46a30 100644
--- a/src/kernels/level3/transpose_fast.opencl
+++ b/src/kernels/level3/transpose_fast.opencl
@@ -39,7 +39,9 @@ R"(
 __attribute__((reqd_work_group_size(TRA_DIM, TRA_DIM, 1)))
 __kernel void TransposeMatrixFast(const int ld,
                                   __global const realT* restrict src,
-                                  __global realT* dest) {
+                                  __global realT* dest,
+                                  const __constant real* restrict arg_alpha) {
+  const real alpha = arg_alpha[0];
 
   // Sets the group identifiers. They might be 'shuffled' around to distribute work in a different
   // way over workgroups, breaking memory-bank dependencies.
@@ -117,12 +119,50 @@ __kernel void TransposeMatrixFast(const int ld,
     results[15] = (realT) {v[0].sF, v[1].sF, v[2].sF, v[3].sF, v[4].sF, v[5].sF, v[6].sF, v[7].sF, v[8].sF, v[9].sF, v[10].sF, v[11].sF, v[12].sF, v[13].sF, v[14].sF, v[15].sF};
   #endif
 
-  // Stores the results into the destination matrix
+  // Multiplies by alpha and then stores the results into the destination matrix
   #pragma unroll
   for (int w_two=0; w_two<TRA_WPT; ++w_two) {
+    realT result;
+    #if TRA_WPT == 1
+      Multiply(result, alpha, results[w_two]);
+    #elif TRA_WPT == 2
+      Multiply(result.x, alpha, results[w_two].x);
+      Multiply(result.y, alpha, results[w_two].y);
+    #elif TRA_WPT == 4
+      Multiply(result.x, alpha, results[w_two].x);
+      Multiply(result.y, alpha, results[w_two].y);
+      Multiply(result.z, alpha, results[w_two].z);
+      Multiply(result.w, alpha, results[w_two].w);
+    #elif TRA_WPT == 8
+      Multiply(result.s0, alpha, results[w_two].s0);
+      Multiply(result.s1, alpha, results[w_two].s1);
+      Multiply(result.s2, alpha, results[w_two].s2);
+      Multiply(result.s3, alpha, results[w_two].s3);
+      Multiply(result.s4, alpha, results[w_two].s4);
+      Multiply(result.s5, alpha, results[w_two].s5);
+      Multiply(result.s6, alpha, results[w_two].s6);
+      Multiply(result.s7, alpha, results[w_two].s7);
+    #elif TRA_WPT == 16
+      Multiply(result.s0, alpha, results[w_two].s0);
+      Multiply(result.s1, alpha, results[w_two].s1);
+      Multiply(result.s2, alpha, results[w_two].s2);
+      Multiply(result.s3, alpha, results[w_two].s3);
+      Multiply(result.s4, alpha, results[w_two].s4);
+      Multiply(result.s5, alpha, results[w_two].s5);
+      Multiply(result.s6, alpha, results[w_two].s6);
+      Multiply(result.s7, alpha, results[w_two].s7);
+      Multiply(result.s8, alpha, results[w_two].s8);
+      Multiply(result.s9, alpha, results[w_two].s9);
+      Multiply(result.sA, alpha, results[w_two].sA);
+      Multiply(result.sB, alpha, results[w_two].sB);
+      Multiply(result.sC, alpha, results[w_two].sC);
+      Multiply(result.sD, alpha, results[w_two].sD);
+      Multiply(result.sE, alpha, results[w_two].sE);
+      Multiply(result.sF, alpha, results[w_two].sF);
+    #endif
     const int id_one = gid0*TRA_DIM + get_local_id(0);
     const int id_two = (gid1*TRA_DIM + get_local_id(1))*TRA_WPT + w_two;
-    dest[id_two*(ld/TRA_WPT) + id_one] = results[w_two];
+    dest[id_two*(ld/TRA_WPT) + id_one] = result;
   }
 }
 
diff --git a/src/kernels/level3/transpose_pad.opencl b/src/kernels/level3/transpose_pad.opencl
index 38c23346..2de0c7bd 100644
--- a/src/kernels/level3/transpose_pad.opencl
+++ b/src/kernels/level3/transpose_pad.opencl
@@ -31,7 +31,9 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
                                  const int dest_one, const int dest_two,
                                  const int dest_ld, const int dest_offset,
                                  __global real* dest,
+                                 const __constant real* restrict arg_alpha,
                                  const int do_conjugate) {
+  const real alpha = arg_alpha[0];
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -75,7 +77,7 @@ __kernel void TransposePadMatrix(const int src_one, const int src_two,
       if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
         real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
         if (do_conjugate == 1) { COMPLEX_CONJUGATE(value); }
-        dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+        Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
       }
     }
   }
@@ -93,8 +95,10 @@ __kernel void TransposeMatrix(const int src_one, const int src_two,
                               const int dest_one, const int dest_two,
                               const int dest_ld, const int dest_offset,
                               __global real* dest,
+                              const __constant real* restrict arg_alpha,
                               const int upper, const int lower,
                               const int diagonal_imag_zero) {
+  const real alpha = arg_alpha[0];
 
   // Local memory to store a tile of the matrix (for coalescing)
   __local real tile[PADTRA_WPT*PADTRA_TILE][PADTRA_WPT*PADTRA_TILE + PADTRA_PAD];
@@ -143,7 +147,7 @@ __kernel void TransposeMatrix(const int src_one, const int src_two,
         if ((id_dest_one < dest_one) && (id_dest_two < dest_two)) {
           real value = tile[get_local_id(0)*PADTRA_WPT + w_two][get_local_id(1)*PADTRA_WPT + w_one];
           if (diagonal_imag_zero == 1 && id_dest_one == id_dest_two) { ImagToZero(value); }
-          dest[id_dest_two*dest_ld + id_dest_one + dest_offset] = value;
+          Multiply(dest[id_dest_two*dest_ld + id_dest_one + dest_offset], alpha, value);
         }
       }
     }
diff --git a/src/routine.cc b/src/routine.cc
index 4b334e60..1cf8bff8 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -302,6 +302,7 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
                                               const size_t dest_one, const size_t dest_two,
                                               const size_t dest_ld, const size_t dest_offset,
                                               const Buffer<T> &dest,
+                                              const T alpha,
                                               const Program &program, const bool do_pad,
                                               const bool do_transpose, const bool do_conjugate,
                                               const bool upper, const bool lower,
@@ -339,6 +340,10 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
     }
   }
 
+  // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+  auto alpha_buffer = Buffer<T>(context_, 1);
+  alpha_buffer.Write(queue_, 1, &alpha);
+
   // Retrieves the kernel from the compiled binary
   try {
     auto kernel = Kernel(program, kernel_name);
@@ -348,6 +353,7 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
       kernel.SetArgument(0, static_cast<int>(src_ld));
       kernel.SetArgument(1, src());
       kernel.SetArgument(2, dest());
+      kernel.SetArgument(3, alpha_buffer());
     }
     else {
       kernel.SetArgument(0, static_cast<int>(src_one));
@@ -360,13 +366,14 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev
       kernel.SetArgument(7, static_cast<int>(dest_ld));
       kernel.SetArgument(8, static_cast<int>(dest_offset));
       kernel.SetArgument(9, dest());
+      kernel.SetArgument(10, alpha_buffer());
       if (do_pad) {
-        kernel.SetArgument(10, static_cast<int>(do_conjugate));
+        kernel.SetArgument(11, static_cast<int>(do_conjugate));
       }
       else {
-        kernel.SetArgument(10, static_cast<int>(upper));
-        kernel.SetArgument(11, static_cast<int>(lower));
-        kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
+        kernel.SetArgument(11, static_cast<int>(upper));
+        kernel.SetArgument(12, static_cast<int>(lower));
+        kernel.SetArgument(13, static_cast<int>(diagonal_imag_zero));
       }
     }
 
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 6fa6a811..42d5f19e 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -145,7 +145,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       m_ceiled, k_ceiled, m_ceiled, 0, a_temp,
-                                      program, true, a_do_transpose, a_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, a_do_transpose, a_conjugate);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessA);
     }
@@ -156,7 +157,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
                                       b_one, b_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
-                                      program, true, b_do_transpose, b_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, b_do_transpose, b_conjugate);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessB);
     }
@@ -167,7 +169,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
       status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
                                       c_one, c_two, c_ld, c_offset, c_buffer,
                                       m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
-                                      program, true, c_do_transpose, false);
+                                      ConstantOne<T>(), program,
+                                      true, c_do_transpose, false);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessC);
     }
@@ -205,7 +208,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
         status = PadCopyTransposeMatrix(event_, eventWaitList,
                                         m_ceiled, n_ceiled, m_ceiled, 0, c_temp,
                                         c_one, c_two, c_ld, c_offset, c_buffer,
-                                        program, false, c_do_transpose, false);
+                                        ConstantOne<T>(), program,
+                                        false, c_do_transpose, false);
         if (ErrorIn(status)) { return status; }
       }
 
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index e83d105f..5ec1f8cd 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -132,7 +132,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       status = PadCopyTransposeMatrix(eventProcessA1.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a1_temp,
-                                      program, true, ab_rotated, ab_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, ab_conjugate);
       eventWaitList.push_back(eventProcessA1);
       if (ErrorIn(status)) { return status; }
     }
@@ -141,7 +142,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       status = PadCopyTransposeMatrix(eventProcessA2.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a2_temp,
-                                      program, true, ab_rotated, !ab_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, !ab_conjugate);
       eventWaitList.push_back(eventProcessA2);
       if (ErrorIn(status)) { return status; }
     }
@@ -150,7 +152,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       status = PadCopyTransposeMatrix(eventProcessB1.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b1_temp,
-                                      program, true, ab_rotated, ab_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, ab_conjugate);
       eventWaitList.push_back(eventProcessB1);
       if (ErrorIn(status)) { return status; }
     }
@@ -159,7 +162,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       status = PadCopyTransposeMatrix(eventProcessB2.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b2_temp,
-                                      program, true, ab_rotated, !ab_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, !ab_conjugate);
       eventWaitList.push_back(eventProcessB2);
       if (ErrorIn(status)) { return status; }
     }
@@ -170,7 +174,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
     status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    program, true, c_rotated, false);
+                                    ConstantOne<T>(), program,
+                                    true, c_rotated, false);
     eventWaitList.push_back(eventProcessC);
     if (ErrorIn(status)) { return status; }
 
@@ -222,7 +227,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
       status = PadCopyTransposeMatrix(event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
-                                      program, false, c_rotated, false, upper, lower, true);
+                                      ConstantOne<T>(), program,
+                                      false, c_rotated, false, upper, lower, true);
       if (ErrorIn(status)) { return status; }
 
       // Successfully finished the computation
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index 9ab50dd2..df97a94f 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -124,7 +124,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
       status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      program, true, a_rotated, a_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, a_rotated, a_conjugate);
       eventWaitList.push_back(eventProcessA);
       if (ErrorIn(status)) { return status; }
     }
@@ -133,7 +134,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
       status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
-                                      program, true, a_rotated, b_conjugate);
+                                      ConstantOne<T>(), program,
+                                      true, a_rotated, b_conjugate);
       eventWaitList.push_back(eventProcessB);
       if (ErrorIn(status)) { return status; }
     }
@@ -144,7 +146,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
     status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    program, true, c_rotated, false);
+                                    ConstantOne<T>(), program,
+                                    true, c_rotated, false);
     eventWaitList.push_back(eventProcessC);
     if (ErrorIn(status)) { return status; }
 
@@ -180,7 +183,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
       status = PadCopyTransposeMatrix(event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
-                                      program, false, c_rotated, false, upper, lower, true);
+                                      ConstantOne<T>(), program,
+                                      false, c_rotated, false, upper, lower, true);
       if (ErrorIn(status)) { return status; }
 
       // Successfully finished the computation
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index 49fbe64b..dd7d19fe 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -124,7 +124,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
       status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
                                       ab_one, ab_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      program, true, ab_rotated, false);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, false);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessA);
     }
@@ -133,7 +134,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
       status = PadCopyTransposeMatrix(eventProcessB.pointer(), emptyEventList,
                                       ab_one, ab_two, b_ld, b_offset, b_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, b_temp,
-                                      program, true, ab_rotated, false);
+                                      ConstantOne<T>(), program,
+                                      true, ab_rotated, false);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessB);
     }
@@ -144,7 +146,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
     status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    program, true, c_rotated, false);
+                                    ConstantOne<T>(), program,
+                                    true, c_rotated, false);
     if (ErrorIn(status)) { return status; }
     eventWaitList.push_back(eventProcessC);
 
@@ -193,7 +196,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
       status = PadCopyTransposeMatrix(event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
-                                      program, false, c_rotated, false, upper, lower, false);
+                                      ConstantOne<T>(), program,
+                                      false, c_rotated, false, upper, lower, false);
       if (ErrorIn(status)) { return status; }
 
       // Successfully finished the computation
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index 9913c7ca..b5817b82 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -117,7 +117,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
       status = PadCopyTransposeMatrix(eventProcessA.pointer(), emptyEventList,
                                       a_one, a_two, a_ld, a_offset, a_buffer,
                                       n_ceiled, k_ceiled, n_ceiled, 0, a_temp,
-                                      program, true, a_rotated, false);
+                                      ConstantOne<T>(), program,
+                                      true, a_rotated, false);
       if (ErrorIn(status)) { return status; }
       eventWaitList.push_back(eventProcessA);
     }
@@ -128,7 +129,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
     status = PadCopyTransposeMatrix(eventProcessC.pointer(), emptyEventList,
                                     n, n, c_ld, c_offset, c_buffer,
                                     n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
-                                    program, true, c_rotated, false);
+                                    ConstantOne<T>(), program,
+                                    true, c_rotated, false);
     if (ErrorIn(status)) { return status; }
     eventWaitList.push_back(eventProcessC);
 
@@ -164,7 +166,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
       status = PadCopyTransposeMatrix(event_, eventWaitList,
                                       n_ceiled, n_ceiled, n_ceiled, 0, c_temp,
                                       n, n, c_ld, c_offset, c_buffer,
-                                      program, false, c_rotated, false, upper, lower, false);
+                                      ConstantOne<T>(), program,
+                                      false, c_rotated, false, upper, lower, false);
       if (ErrorIn(status)) { return status; }
 
 
diff --git a/src/routines/levelx/xomatcopy.cc b/src/routines/levelx/xomatcopy.cc
new file mode 100644
index 00000000..77fc445f
--- /dev/null
+++ b/src/routines/levelx/xomatcopy.cc
@@ -0,0 +1,103 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements the Xomatcopy class (see the header for information about the class).
+//
+// =================================================================================================
+
+#include "internal/routines/levelx/xomatcopy.h"
+
+#include <string>
+#include <vector>
+
+namespace clblast {
+// =================================================================================================
+
+// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xomatcopy<half>::precision_ = Precision::kHalf;
+template <> const Precision Xomatcopy<float>::precision_ = Precision::kSingle;
+template <> const Precision Xomatcopy<double>::precision_ = Precision::kDouble;
+template <> const Precision Xomatcopy<float2>::precision_ = Precision::kComplexSingle;
+template <> const Precision Xomatcopy<double2>::precision_ = Precision::kComplexDouble;
+
+// =================================================================================================
+
+// Constructor: forwards to base class constructor
+template <typename T>
+Xomatcopy<T>::Xomatcopy(Queue &queue, EventPointer event, const std::string &name):
+    Routine<T>(queue, event, name, {"Copy","Pad","Transpose","Padtranspose"}, precision_) {
+  source_string_ =
+    #include "../../kernels/level3/level3.opencl"
+    #include "../../kernels/level3/copy_fast.opencl"
+    #include "../../kernels/level3/copy_pad.opencl"
+    #include "../../kernels/level3/transpose_fast.opencl"
+    #include "../../kernels/level3/transpose_pad.opencl"
+  ;
+}
+
+// =================================================================================================
+
+// The main routine
+template <typename T>
+StatusCode Xomatcopy<T>::DoOmatcopy(const Layout layout, const Transpose a_transpose,
+                                    const size_t m, const size_t n, const T alpha,
+                                    const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+                                    const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
+
+  // Makes sure all dimensions are larger than zero
+  if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
+
+  // Determines whether to transpose the matrix A
+  const auto transpose = (a_transpose != Transpose::kNo);
+
+  // In case of complex data-types, the transpose can also become a conjugate transpose
+  const auto conjugate = (a_transpose == Transpose::kConjugate);
+
+  // Computes the dimensions of the two matrices
+  const auto rotated = (layout == Layout::kRowMajor);
+  const auto a_one = (rotated) ? n : m;
+  const auto a_two = (rotated) ? m : n;
+  const auto b_one = (transpose) ? a_two : a_one;
+  const auto b_two = (transpose) ? a_one : a_two;
+
+  // Tests the matrices for validity, first from a perspective of the OpenCL buffers and their
+  // sizes, and then from a perspective of parameter values (e.g. m, n). Tests whether the OpenCL
+  // buffers are valid and non-zero and whether the OpenCL buffers have sufficient storage space.
+  // Also tests that the leading dimensions of:
+  //    matrix A cannot be less than N when rotated, or less than M when not-rotated
+  //    matrix B cannot be less than M when rotated, or less than N when not-rotated
+  auto status = TestMatrixA(a_one, a_two, a_buffer, a_offset, a_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+  status = TestMatrixB(b_one, b_two, b_buffer, b_offset, b_ld, sizeof(T));
+  if (ErrorIn(status)) { return status; }
+
+  // Loads the program from the database
+  const auto program = GetProgramFromCache();
+
+  auto emptyEventList = std::vector<Event>();
+  status = PadCopyTransposeMatrix(event_, emptyEventList,
+                                  a_one, a_two, a_ld, a_offset, a_buffer,
+                                  b_one, b_two, b_ld, b_offset, b_buffer,
+                                  alpha, program, false, transpose, conjugate);
+  if (ErrorIn(status)) { return status; }
+
+  return StatusCode::kSuccess;
+}
+
+// =================================================================================================
+
+// Compiles the templated class
+template class Xomatcopy<half>;
+template class Xomatcopy<float>;
+template class Xomatcopy<double>;
+template class Xomatcopy<float2>;
+template class Xomatcopy<double2>;
+
+// =================================================================================================
+} // namespace clblast
diff --git a/src/tuning/copy_fast.cc b/src/tuning/copy_fast.cc
index 2da707be..09fdbaba 100644
--- a/src/tuning/copy_fast.cc
+++ b/src/tuning/copy_fast.cc
@@ -37,7 +37,7 @@ class TuneCopy {
   }
 
   // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
@@ -86,9 +86,11 @@ class TuneCopy {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentInput(alpha_buffer);
   }
 
   // Describes how to compute the performance metrics
diff --git a/src/tuning/copy_pad.cc b/src/tuning/copy_pad.cc
index ec392471..7088b3bf 100644
--- a/src/tuning/copy_pad.cc
+++ b/src/tuning/copy_pad.cc
@@ -37,7 +37,7 @@ class TunePad {
   }
 
   // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
@@ -86,6 +86,7 @@ class TunePad {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -96,6 +97,7 @@ class TunePad {
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(0);
     tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentInput(alpha_buffer);
     tuner.AddArgumentScalar(0);
   }
 
diff --git a/src/tuning/transpose_fast.cc b/src/tuning/transpose_fast.cc
index 1a5260c1..3b0bdeb5 100644
--- a/src/tuning/transpose_fast.cc
+++ b/src/tuning/transpose_fast.cc
@@ -37,7 +37,7 @@ class TuneTranspose {
   }
 
   // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
@@ -91,9 +91,11 @@ class TuneTranspose {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentInput(alpha_buffer);
   }
 
   // Describes how to compute the performance metrics
diff --git a/src/tuning/transpose_pad.cc b/src/tuning/transpose_pad.cc
index 08b52510..b9ab3ffa 100644
--- a/src/tuning/transpose_pad.cc
+++ b/src/tuning/transpose_pad.cc
@@ -37,7 +37,7 @@ class TunePadTranspose {
   }
 
   // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() { return {kArgM, kArgN}; }
+  static std::vector<std::string> GetOptions() { return {kArgM, kArgN, kArgAlpha}; }
 
   // Tests for valid arguments
   static void TestValidArguments(const Arguments<T> &) { }
@@ -90,6 +90,7 @@ class TunePadTranspose {
                            std::vector<T> &, std::vector<T> &,
                            std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
                            std::vector<T> &) {
+    auto alpha_buffer = std::vector<T>{args.alpha};
     tuner.AddArgumentScalar(static_cast<int>(args.m));
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(static_cast<int>(args.m));
@@ -100,6 +101,7 @@ class TunePadTranspose {
     tuner.AddArgumentScalar(static_cast<int>(args.n));
     tuner.AddArgumentScalar(0);
     tuner.AddArgumentOutput(b_mat);
+    tuner.AddArgumentInput(alpha_buffer);
     tuner.AddArgumentScalar(0);
   }
 
diff --git a/test/correctness/routines/levelx/xomatcopy.cc b/test/correctness/routines/levelx/xomatcopy.cc
new file mode 100644
index 00000000..43021bc9
--- /dev/null
+++ b/test/correctness/routines/levelx/xomatcopy.cc
@@ -0,0 +1,30 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "correctness/testblas.h"
+#include "routines/levelx/xomatcopy.h"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  auto errors = size_t{0};
+  errors += clblast::RunTests<clblast::TestXomatcopy<float>, float, float>(argc, argv, false, "SOMATCOPY");
+  errors += clblast::RunTests<clblast::TestXomatcopy<double>, double, double>(argc, argv, true, "DOMATCOPY");
+  errors += clblast::RunTests<clblast::TestXomatcopy<float2>, float2, float2>(argc, argv, true, "COMATCOPY");
+  errors += clblast::RunTests<clblast::TestXomatcopy<double2>, double2, double2>(argc, argv, true, "ZOMATCOPY");
+  errors += clblast::RunTests<clblast::TestXomatcopy<half>, half, half>(argc, argv, true, "HOMATCOPY");
+  if (errors > 0) { return 1; } else { return 0; }
+}
+
+// =================================================================================================
diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc
index e5addc48..9f842d91 100644
--- a/test/correctness/testblas.cc
+++ b/test/correctness/testblas.cc
@@ -174,8 +174,8 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
 template <typename T, typename U>
 void TestBlas<T,U>::TestInvalid(std::vector<Arguments<U>> &test_vector, const std::string &name) {
   if (!PrecisionSupported<T>(device_)) { return; }
-  if (!compare_clblas_) { return; }
-  if (std::is_same<T, half>::value) { return; }
+  if (!compare_clblas_) { return; } // not supported for CPU BLAS routines
+  if (std::is_same<T, half>::value) { return; } // not supported for half-precision
   TestStart("invalid buffer sizes", name);
 
   // Iterates over all the to-be-tested combinations of arguments
diff --git a/test/correctness/testblas.h b/test/correctness/testblas.h
index cce10751..e849466a 100644
--- a/test/correctness/testblas.h
+++ b/test/correctness/testblas.h
@@ -129,6 +129,13 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
     const auto reference_routine2 = C::RunReference2; // CBLAS
   #endif
 
+  // Non-BLAS routines cannot be fully tested
+  if (!silent && C::BLASLevel() == 4) {
+    fprintf(stdout, "\n* NOTE: This non-BLAS routine is tested against a custom implementation,\n");
+    fprintf(stdout, "  not against clBLAS or a CPU BLAS library. Thus, the arguments '-clblas'\n");
+    fprintf(stdout, "  and '-cblas' have no effect.\n");
+  }
+
   // Creates a tester
   auto options = C::GetOptions();
   TestBlas<T,U> tester{argc, argv, silent, name, options,
@@ -176,8 +183,9 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
   auto ap_sizes = std::vector<size_t>{args.ap_size};
 
   // Sets the dimensions of the matrices or vectors depending on the BLAS level
-  auto dimensions = (C::BLASLevel() == 3) ? tester.kMatrixDims :
-                    (C::BLASLevel() == 2) ? tester.kMatrixVectorDims :
+  auto dimensions = (C::BLASLevel() == 4) ? tester.kMatrixDims : // non-BLAS extra routines
+                    (C::BLASLevel() == 3) ? tester.kMatrixDims : // level 3
+                    (C::BLASLevel() == 2) ? tester.kMatrixVectorDims : // level 2
                     tester.kVectorDims; // else: level 1
 
   // For the options relevant to this routine, sets the vectors to proper values
@@ -318,7 +326,9 @@ size_t RunTests(int argc, char *argv[], const bool silent, const std::string &na
               // Runs the tests
               tester.TestRegular(regular_test_vector, case_name);
               #ifdef CLBLAST_REF_CLBLAS
-                tester.TestInvalid(invalid_test_vector, case_name);
+                if (C::BLASLevel() != 4) {
+                  tester.TestInvalid(invalid_test_vector, case_name);
+                }
               #endif
             }
           }
diff --git a/test/performance/client.cc b/test/performance/client.cc
index 5a7226df..4c0c79a6 100644
--- a/test/performance/client.cc
+++ b/test/performance/client.cc
@@ -42,8 +42,10 @@ Client<T,U>::Client(const Routine run_routine,
 // applicable, but are searched for anyway to be able to create one common argument parser. All
 // arguments have a default value in case they are not found.
 template <typename T, typename U>
-Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
-                                         const GetMetric default_b_ld, const GetMetric default_c_ld) {
+Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const size_t level,
+                                         const GetMetric default_a_ld,
+                                         const GetMetric default_b_ld,
+                                         const GetMetric default_c_ld) {
   auto args = Arguments<U>{};
   auto help = std::string{"\n* Options given/available:\n"};
 
@@ -116,6 +118,17 @@ Arguments<U> Client<T,U>::ParseArguments(int argc, char *argv[], const GetMetric
   // which is thus always displayed (unless silence is specified).
   if (!args.silent) { fprintf(stdout, "%s\n", help.c_str()); }
 
+  // Comparison against a non-BLAS routine is not supported
+  if (level == 4) { // level-4 == level-X
+    if (args.compare_clblas != 0 || args.compare_cblas != 0) {
+      if (!args.silent) {
+        fprintf(stdout, "* Disabling clBLAS and CPU BLAS comparisons for this non-BLAS routine\n\n");
+      }
+    }
+    args.compare_clblas = 0;
+    args.compare_cblas = 0;
+  }
+
   // Comparison against clBLAS or a CPU BLAS library is not supported in case of half-precision
   if (args.precision == Precision::kHalf) {
     if (args.compare_clblas != 0 || args.compare_cblas != 0) {
diff --git a/test/performance/client.h b/test/performance/client.h
index 8d0597d7..493a7aed 100644
--- a/test/performance/client.h
+++ b/test/performance/client.h
@@ -53,8 +53,10 @@ class Client {
 
   // Parses all command-line arguments, filling in the arguments structure. If no command-line
   // argument is given for a particular argument, it is filled in with a default value.
-  Arguments<U> ParseArguments(int argc, char *argv[], const GetMetric default_a_ld,
-                              const GetMetric default_b_ld, const GetMetric default_c_ld);
+  Arguments<U> ParseArguments(int argc, char *argv[], const size_t level,
+                              const GetMetric default_a_ld,
+                              const GetMetric default_b_ld,
+                              const GetMetric default_c_ld);
 
   // The main client function, setting-up arguments, matrices, OpenCL buffers, etc. After set-up, it
   // calls the client routines.
@@ -97,14 +99,14 @@ void RunClient(int argc, char *argv[]) {
 
   // Sets the reference to test against
   #ifdef CLBLAST_REF_CLBLAS
-    const auto reference1 = C::RunReference1; // clBLAS when available
+    auto reference1 = C::RunReference1; // clBLAS when available
   #else
-    const auto reference1 = ReferenceNotAvailable<T,U>;
+    auto reference1 = ReferenceNotAvailable<T,U>;
   #endif
   #ifdef CLBLAST_REF_CBLAS
-    const auto reference2 = C::RunReference2; // CBLAS when available
+    auto reference2 = C::RunReference2; // CBLAS when available
   #else
-    const auto reference2 = ReferenceNotAvailable<T,U>;
+    auto reference2 = ReferenceNotAvailable<T,U>;
   #endif
 
   // Creates a new client
@@ -112,7 +114,8 @@ void RunClient(int argc, char *argv[]) {
                             C::GetFlops, C::GetBytes);
 
   // Simple command line argument parser with defaults
-  auto args = client.ParseArguments(argc, argv, C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
+  auto args = client.ParseArguments(argc, argv, C::BLASLevel(),
+                                    C::DefaultLDA, C::DefaultLDB, C::DefaultLDC);
   if (args.print_help) { return; }
 
   // Runs the client
diff --git a/test/performance/routines/levelx/xomatcopy.cc b/test/performance/routines/levelx/xomatcopy.cc
new file mode 100644
index 00000000..851f6ee1
--- /dev/null
+++ b/test/performance/routines/levelx/xomatcopy.cc
@@ -0,0 +1,36 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// =================================================================================================
+
+#include "performance/client.h"
+#include "routines/levelx/xomatcopy.h"
+
+// Shortcuts to the clblast namespace
+using float2 = clblast::float2;
+using double2 = clblast::double2;
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  switch(clblast::GetPrecision(argc, argv, clblast::Precision::kSingle)) {
+    case clblast::Precision::kHalf:
+      clblast::RunClient<clblast::TestXomatcopy<half>, half, half>(argc, argv); break;
+    case clblast::Precision::kSingle:
+      clblast::RunClient<clblast::TestXomatcopy<float>, float, float>(argc, argv); break;
+    case clblast::Precision::kDouble:
+      clblast::RunClient<clblast::TestXomatcopy<double>, double, double>(argc, argv); break;
+    case clblast::Precision::kComplexSingle:
+      clblast::RunClient<clblast::TestXomatcopy<float2>, float2, float2>(argc, argv); break;
+    case clblast::Precision::kComplexDouble:
+      clblast::RunClient<clblast::TestXomatcopy<double2>, double2, double2>(argc, argv); break;
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/test/routines/levelx/xomatcopy.h b/test/routines/levelx/xomatcopy.h
new file mode 100644
index 00000000..c499ba75
--- /dev/null
+++ b/test/routines/levelx/xomatcopy.h
@@ -0,0 +1,164 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements a class with static methods to describe the Xomatcopy routine. Examples of
+// such 'descriptions' are how to calculate the size a of buffer or how to run the routine. These
+// static methods are used by the correctness tester and the performance tester.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_TEST_ROUTINES_XOMATCOPY_H_
+#define CLBLAST_TEST_ROUTINES_XOMATCOPY_H_
+
+#include <vector>
+#include <string>
+
+namespace clblast {
+// =================================================================================================
+
+// See comment at top of file for a description of the class
+template <typename T>
+class TestXomatcopy {
+ public:
+
+  // The BLAS level: 4 for the extra routines
+  static size_t BLASLevel() { return 4; }
+
+  // The list of arguments relevant for this routine
+  static std::vector<std::string> GetOptions() {
+    return {kArgM, kArgN,
+            kArgLayout, kArgATransp,
+            kArgALeadDim, kArgBLeadDim,
+            kArgAOffset, kArgBOffset,
+            kArgAlpha};
+  }
+
+  // Describes how to obtain the sizes of the buffers
+  static size_t GetSizeA(const Arguments<T> &args) {
+    const auto a_rotated = (args.layout == Layout::kRowMajor);
+    const auto a_two = (a_rotated) ? args.m : args.n;
+    return a_two * args.a_ld + args.a_offset;
+  }
+  static size_t GetSizeB(const Arguments<T> &args) {
+    const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                           (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    const auto b_two = (b_rotated) ? args.n : args.m;
+    return b_two * args.b_ld + args.b_offset;
+  }
+
+  // Describes how to set the sizes of all the buffers
+  static void SetSizes(Arguments<T> &args) {
+    args.a_size = GetSizeA(args);
+    args.b_size = GetSizeB(args);
+  }
+
+  // Describes what the default values of the leading dimensions of the matrices are
+  static size_t DefaultLDA(const Arguments<T> &args) { return args.n; }
+  static size_t DefaultLDB(const Arguments<T> &args) { return args.m; }
+  static size_t DefaultLDC(const Arguments<T> &) { return 1; } // N/A for this routine
+
+  // Describes which omatcopyose options are relevant for this routine
+  using Transposes = std::vector<Transpose>;
+  static Transposes GetATransposes(const Transposes &all) { return all; }
+  static Transposes GetBTransposes(const Transposes &) { return {}; } // N/A for this routine
+
+  // Describes how to run the CLBlast routine
+  static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    auto queue_plain = queue();
+    auto event = cl_event{};
+    auto status = Omatcopy<T>(args.layout, args.a_transpose,
+                              args.m, args.n, args.alpha,
+                              buffers.a_mat(), args.a_offset, args.a_ld,
+                              buffers.b_mat(), args.b_offset, args.b_ld,
+                              &queue_plain, &event);
+    clWaitForEvents(1, &event);
+    return status;
+  }
+
+  // Describes how to run a naive version of the routine (for correctness/performance comparison).
+  // Note that a proper clBLAS or CPU BLAS comparison is not available for non-BLAS routines.
+
+  #ifdef CLBLAST_REF_CLBLAS
+    static StatusCode RunReference1(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+        return RunReference2(args, buffers, queue);
+    }
+  #endif
+
+  #ifdef CLBLAST_REF_CBLAS
+    static StatusCode RunReference2(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+
+      // Data transfer from OpenCL to std::vector
+      std::vector<T> a_mat_cpu(args.a_size, static_cast<T>(0));
+      std::vector<T> b_mat_cpu(args.b_size, static_cast<T>(0));
+      buffers.a_mat.Read(queue, args.a_size, a_mat_cpu);
+      buffers.b_mat.Read(queue, args.b_size, b_mat_cpu);
+
+      // Checking for invalid arguments
+      const auto a_rotated = (args.layout == Layout::kRowMajor);
+      const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                             (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+      const auto a_base = (a_rotated) ? args.a_ld*(args.m-1) + args.n : args.a_ld*(args.n-1) + args.m;
+      const auto b_base = (b_rotated) ? args.b_ld*(args.m-1) + args.n : args.b_ld*(args.n-1) + args.m;
+      if ((args.m == 0) || (args.n == 0)) { return StatusCode::kInvalidDimension; }
+      if ((args.a_ld < args.m && !a_rotated) || (args.a_ld < args.n && a_rotated)) { return StatusCode::kInvalidLeadDimA; }
+      if ((args.b_ld < args.m && !b_rotated) || (args.b_ld < args.n && b_rotated)) { return StatusCode::kInvalidLeadDimB; }
+      if (buffers.a_mat.GetSize() < (a_base + args.a_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryA; }
+      if (buffers.b_mat.GetSize() < (b_base + args.b_offset) * sizeof(T)) { return StatusCode::kInsufficientMemoryB; }
+
+      // Matrix copy, scaling, and/or transpose
+      for (auto id1 = size_t{0}; id1 < args.m; ++id1) {
+        for (auto id2 = size_t{0}; id2 < args.n; ++id2) {
+          const auto a_one = (a_rotated) ? id2 : id1;
+          const auto a_two = (a_rotated) ? id1 : id2;
+          const auto b_one = (b_rotated) ? id2 : id1;
+          const auto b_two = (b_rotated) ? id1 : id2;
+          const auto a_index = a_two * args.a_ld + a_one + args.a_offset;
+          const auto b_index = b_two * args.b_ld + b_one + args.b_offset;
+          b_mat_cpu[b_index] = args.alpha * a_mat_cpu[a_index];
+        }
+      }
+
+      // Data transfer back to OpenCL
+      buffers.b_mat.Write(queue, args.b_size, b_mat_cpu);
+      return StatusCode::kSuccess;
+    }
+  #endif
+
+  // Describes how to download the results of the computation (more importantly: which buffer)
+  static std::vector<T> DownloadResult(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
+    std::vector<T> result(args.b_size, static_cast<T>(0));
+    buffers.b_mat.Read(queue, args.b_size, result);
+    return result;
+  }
+
+  // Describes how to compute the indices of the result buffer
+  static size_t ResultID1(const Arguments<T> &args) { return args.m; }
+  static size_t ResultID2(const Arguments<T> &args) { return args.n; }
+  static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t id2) {
+    const auto b_rotated = (args.layout == Layout::kColMajor && args.a_transpose != Transpose::kNo) ||
+                           (args.layout == Layout::kRowMajor && args.a_transpose == Transpose::kNo);
+    const auto b_one = (b_rotated) ? id2 : id1;
+    const auto b_two = (b_rotated) ? id1 : id2;
+    return b_two * args.b_ld + b_one + args.b_offset;
+  }
+
+  // Describes how to compute performance metrics
+  static size_t GetFlops(const Arguments<T> &args) {
+    return args.m*args.n;
+  }
+  static size_t GetBytes(const Arguments<T> &args) {
+    return (2*args.m*args.n) * sizeof(T);
+  }
+};
+
+// =================================================================================================
+} // namespace clblast
+
+// CLBLAST_TEST_ROUTINES_XOMATCOPY_H_
+#endif
author	Cedric Nugteren <web@cedricnugteren.nl>	2016-06-16 18:07:46 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2016-06-16 18:07:46 +0200
commit	52ccaf5b25e14c9ce032315e5e96b1f27886d481 (patch)
tree	087288b7aebf2a06ffc4e7dcbcd4353f7a3be6a7
parent	39b7dbc5e37829abfbcfb77852b9138b31540b42 (diff)