diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2015-10-17 15:30:06 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2015-10-17 15:30:06 +0200 |
commit | 653feca5644592003345f5832bb81138cc07d783 (patch) | |
tree | 61ebab5da9b8807ad1d085e2a249df39b861db0e | |
parent | 92b4b0d1feaaf92e160fa0342daf4269f24fb4d2 (diff) | |
parent | 0d4091fdfbb6ed29c5b6f73b4d272fdaa01c3ba8 (diff) |
Merge pull request #28 from CNugteren/kernels_reorganization
Kernels re-organization level-3
35 files changed, 72 insertions, 67 deletions
@@ -1,6 +1,7 @@ Development version (next release) - Improved structure and performance of level-2 routines (xSYMV/xHEMV) +- Reduced compilation time of level-3 OpenCL kernels - Added level-1 routines: * SSWAP/DSWAP/CSWAP/ZSWAP * SSCAL/DSCAL/CSCAL/ZSCAL diff --git a/include/internal/routines/level1/xaxpy.h b/include/internal/routines/level1/xaxpy.h index 4b9da890..689cf169 100644 --- a/include/internal/routines/level1/xaxpy.h +++ b/include/internal/routines/level1/xaxpy.h @@ -35,7 +35,7 @@ class Xaxpy: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xaxpy(Queue &queue, Event &event); + Xaxpy(Queue &queue, Event &event, const std::string &name = "AXPY"); // Templated-precision implementation of the routine StatusCode DoAxpy(const size_t n, const T alpha, diff --git a/include/internal/routines/level1/xcopy.h b/include/internal/routines/level1/xcopy.h index c71583c5..15f339aa 100644 --- a/include/internal/routines/level1/xcopy.h +++ b/include/internal/routines/level1/xcopy.h @@ -35,7 +35,7 @@ class Xcopy: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xcopy(Queue &queue, Event &event); + Xcopy(Queue &queue, Event &event, const std::string &name = "COPY"); // Templated-precision implementation of the routine StatusCode DoCopy(const size_t n, diff --git a/include/internal/routines/level1/xscal.h b/include/internal/routines/level1/xscal.h index 0aa6059d..d97b5a07 100644 --- a/include/internal/routines/level1/xscal.h +++ b/include/internal/routines/level1/xscal.h @@ -34,7 +34,7 @@ class Xscal: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xscal(Queue &queue, Event &event); + Xscal(Queue &queue, Event &event, const std::string &name = "SCAL"); // Templated-precision implementation of the routine StatusCode DoScal(const size_t n, const T alpha, diff --git a/include/internal/routines/level1/xswap.h b/include/internal/routines/level1/xswap.h index 3dabc62c..fe79882b 100644 --- a/include/internal/routines/level1/xswap.h +++ b/include/internal/routines/level1/xswap.h @@ -35,7 +35,7 @@ class Xswap: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xswap(Queue &queue, Event &event); + Xswap(Queue &queue, Event &event, const std::string &name = "SWAP"); // Templated-precision implementation of the routine StatusCode DoSwap(const size_t n, diff --git a/include/internal/routines/level3/xgemm.h b/include/internal/routines/level3/xgemm.h index a0c8b595..9b40a7fc 100644 --- a/include/internal/routines/level3/xgemm.h +++ b/include/internal/routines/level3/xgemm.h @@ -38,7 +38,7 @@ class Xgemm: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xgemm(Queue &queue, Event &event); + Xgemm(Queue &queue, Event &event, const std::string &name = "GEMM"); // Templated-precision implementation of the routine StatusCode DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, diff --git a/include/internal/routines/level3/xhemm.h b/include/internal/routines/level3/xhemm.h index 5f1e8723..ca38ca08 100644 --- a/include/internal/routines/level3/xhemm.h +++ b/include/internal/routines/level3/xhemm.h @@ -37,7 +37,7 @@ class Xhemm: public Xgemm<T> { using Xgemm<T>::DoGemm; // Constructor - Xhemm(Queue &queue, Event &event); + Xhemm(Queue &queue, Event &event, const std::string &name = "HEMM"); // Templated-precision implementation of the routine StatusCode DoHemm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/routines/level3/xher2k.h b/include/internal/routines/level3/xher2k.h index 9e961d23..7113a172 100644 --- a/include/internal/routines/level3/xher2k.h +++ b/include/internal/routines/level3/xher2k.h @@ -40,7 +40,7 @@ class Xher2k: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xher2k(Queue &queue, Event &event); + Xher2k(Queue &queue, Event &event, const std::string &name = "HER2K"); // Templated-precision implementation of the routine StatusCode DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, diff --git a/include/internal/routines/level3/xherk.h b/include/internal/routines/level3/xherk.h index f285a71c..47112c2c 100644 --- a/include/internal/routines/level3/xherk.h +++ b/include/internal/routines/level3/xherk.h @@ -39,7 +39,7 @@ class Xherk: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xherk(Queue &queue, Event &event); + Xherk(Queue &queue, Event &event, const std::string &name = "HERK"); // Templated-precision implementation of the routine StatusCode DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, diff --git a/include/internal/routines/level3/xsymm.h b/include/internal/routines/level3/xsymm.h index 9ed3c722..9fc80eb4 100644 --- a/include/internal/routines/level3/xsymm.h +++ b/include/internal/routines/level3/xsymm.h @@ -39,7 +39,7 @@ class Xsymm: public Xgemm<T> { using Xgemm<T>::DoGemm; // Constructor - Xsymm(Queue &queue, Event &event); + Xsymm(Queue &queue, Event &event, const std::string &name = "SYMM"); // Templated-precision implementation of the routine StatusCode DoSymm(const Layout layout, const Side side, const Triangle triangle, diff --git a/include/internal/routines/level3/xsyr2k.h b/include/internal/routines/level3/xsyr2k.h index 85936658..c4679028 100644 --- a/include/internal/routines/level3/xsyr2k.h +++ b/include/internal/routines/level3/xsyr2k.h @@ -40,7 +40,7 @@ class Xsyr2k: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xsyr2k(Queue &queue, Event &event); + Xsyr2k(Queue &queue, Event &event, const std::string &name = "SYR2K"); // Templated-precision implementation of the routine StatusCode DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, diff --git a/include/internal/routines/level3/xsyrk.h b/include/internal/routines/level3/xsyrk.h index 14d51a58..abf6b681 100644 --- a/include/internal/routines/level3/xsyrk.h +++ b/include/internal/routines/level3/xsyrk.h @@ -41,7 +41,7 @@ class Xsyrk: public Routine<T> { using Routine<T>::ErrorIn; // Constructor - Xsyrk(Queue &queue, Event &event); + Xsyrk(Queue &queue, Event &event, const std::string &name = "SYRK"); // Templated-precision implementation of the routine StatusCode DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, diff --git a/include/internal/routines/level3/xtrmm.h b/include/internal/routines/level3/xtrmm.h index d8ac60fd..a1f4d15c 100644 --- a/include/internal/routines/level3/xtrmm.h +++ b/include/internal/routines/level3/xtrmm.h @@ -38,7 +38,7 @@ class Xtrmm: public Xgemm<T> { using Xgemm<T>::DoGemm; // Constructor - Xtrmm(Queue &queue, Event &event); + Xtrmm(Queue &queue, Event &event, const std::string &name = "TRMM"); // Templated-precision implementation of the routine StatusCode DoTrmm(const Layout layout, const Side side, const Triangle triangle, diff --git a/src/kernels/copy.opencl b/src/kernels/level3/copy.opencl index 7dde688b..7dde688b 100644 --- a/src/kernels/copy.opencl +++ b/src/kernels/level3/copy.opencl diff --git a/src/kernels/pad.opencl b/src/kernels/level3/pad.opencl index 69324f20..eefddce4 100644 --- a/src/kernels/pad.opencl +++ b/src/kernels/level3/pad.opencl @@ -117,6 +117,7 @@ __kernel void UnPadMatrix(const int src_one, const int src_two, } // ================================================================================================= +#if defined(ROUTINE_SYMM) // Kernel to populate a squared symmetric matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. @@ -185,8 +186,9 @@ __kernel void SymmUpperToSquared(const int src_dim, } } +#endif // ================================================================================================= -#if PRECISION == 3232 || PRECISION == 6464 +#if defined(ROUTINE_HEMM) && (PRECISION == 3232 || PRECISION == 6464) // Kernel to populate a squared hermitian matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. @@ -269,6 +271,7 @@ __kernel void HermUpperToSquared(const int src_dim, #endif // ================================================================================================= +#if defined(ROUTINE_TRMM) // Kernel to populate a squared triangular matrix, given that the triangle which holds the data is // stored as the lower-triangle of the input matrix. This uses the padding kernel's parameters. @@ -341,6 +344,7 @@ __kernel void TrmmUpperToSquared(const int src_dim, } } +#endif // ================================================================================================= // End of the C++11 raw string literal diff --git a/src/kernels/padtranspose.opencl b/src/kernels/level3/padtranspose.opencl index a6b70f0b..a6b70f0b 100644 --- a/src/kernels/padtranspose.opencl +++ b/src/kernels/level3/padtranspose.opencl diff --git a/src/kernels/transpose.opencl b/src/kernels/level3/transpose.opencl index d726f7ec..d726f7ec 100644 --- a/src/kernels/transpose.opencl +++ b/src/kernels/level3/transpose.opencl diff --git a/src/kernels/xgemm.opencl b/src/kernels/level3/xgemm.opencl index 8db0f557..8db0f557 100644 --- a/src/kernels/xgemm.opencl +++ b/src/kernels/level3/xgemm.opencl diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index ce138fa6..f37a0724 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -29,8 +29,8 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xaxpy<T>::Xaxpy(Queue &queue, Event &event): - Routine<T>(queue, event, "AXPY", {"Xaxpy"}, precision_) { +Xaxpy<T>::Xaxpy(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xaxpy.opencl" diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index 52e029b9..2b00d43f 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -29,8 +29,8 @@ template <> const Precision Xcopy<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xcopy<T>::Xcopy(Queue &queue, Event &event): - Routine<T>(queue, event, "COPY", {"Xaxpy"}, precision_) { +Xcopy<T>::Xcopy(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xcopy.opencl" diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index 13e1080c..3fc36b3d 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -29,8 +29,8 @@ template <> const Precision Xscal<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xscal<T>::Xscal(Queue &queue, Event &event): - Routine<T>(queue, event, "SCAL", {"Xaxpy"}, precision_) { +Xscal<T>::Xscal(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xscal.opencl" diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index b22b3bdb..123977d3 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -29,8 +29,8 @@ template <> const Precision Xswap<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xswap<T>::Xswap(Queue &queue, Event &event): - Routine<T>(queue, event, "SWAP", {"Xaxpy"}, precision_) { +Xswap<T>::Xswap(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/level1/level1.opencl" #include "../../kernels/level1/xswap.opencl" diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 525a82e6..94aadcad 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -29,14 +29,14 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xgemm<T>::Xgemm(Queue &queue, Event &event): - Routine<T>(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xgemm<T>::Xgemm(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index a1c0c7c1..bcc60dee 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -21,8 +21,8 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xhemm<T>::Xhemm(Queue &queue, Event &event): - Xgemm<T>(queue, event) { +Xhemm<T>::Xhemm(Queue &queue, Event &event, const std::string &name): + Xgemm<T>(queue, event, name) { } // ================================================================================================= diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 29b2f733..5b84decb 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -27,14 +27,14 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom // Constructor: forwards to base class constructor template <typename T, typename U> -Xher2k<T,U>::Xher2k(Queue &queue, Event &event): - Routine<T>(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xher2k<T,U>::Xher2k(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 5174e9ab..6a915c0b 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -27,14 +27,14 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp // Constructor: forwards to base class constructor template <typename T, typename U> -Xherk<T,U>::Xherk(Queue &queue, Event &event): - Routine<T>(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xherk<T,U>::Xherk(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 37c08d3b..583d5c7d 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -21,8 +21,8 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xsymm<T>::Xsymm(Queue &queue, Event &event): - Xgemm<T>(queue, event) { +Xsymm<T>::Xsymm(Queue &queue, Event &event, const std::string &name): + Xgemm<T>(queue, event, name) { } // ================================================================================================= diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index b36e7c5e..de5f1afc 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -29,14 +29,14 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou // Constructor: forwards to base class constructor template <typename T> -Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event): - Routine<T>(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index e4668216..d8fc6357 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -29,14 +29,14 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xsyrk<T>::Xsyrk(Queue &queue, Event &event): - Routine<T>(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xsyrk<T>::Xsyrk(Queue &queue, Event &event, const std::string &name): + Routine<T>(queue, event, name, {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = - #include "../../kernels/copy.opencl" - #include "../../kernels/pad.opencl" - #include "../../kernels/transpose.opencl" - #include "../../kernels/padtranspose.opencl" - #include "../../kernels/xgemm.opencl" + #include "../../kernels/level3/copy.opencl" + #include "../../kernels/level3/pad.opencl" + #include "../../kernels/level3/transpose.opencl" + #include "../../kernels/level3/padtranspose.opencl" + #include "../../kernels/level3/xgemm.opencl" ; } diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 8be7d950..1180c026 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -21,8 +21,8 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xtrmm<T>::Xtrmm(Queue &queue, Event &event): - Xgemm<T>(queue, event) { +Xtrmm<T>::Xtrmm(Queue &queue, Event &event, const std::string &name): + Xgemm<T>(queue, event, name) { } // ================================================================================================= diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc index 23828b25..e2837e60 100644 --- a/src/tuning/copy.cc +++ b/src/tuning/copy.cc @@ -31,7 +31,7 @@ class TuneCopy { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/copy.opencl" + #include "../src/kernels/level3/copy.opencl" ; } diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc index 6a826b6b..72729422 100644 --- a/src/tuning/pad.cc +++ b/src/tuning/pad.cc @@ -31,7 +31,7 @@ class TunePad { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/pad.opencl" + #include "../src/kernels/level3/pad.opencl" ; } diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc index 3f233809..5edd89e0 100644 --- a/src/tuning/padtranspose.cc +++ b/src/tuning/padtranspose.cc @@ -31,7 +31,7 @@ class TunePadTranspose { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/padtranspose.opencl" + #include "../src/kernels/level3/padtranspose.opencl" ; } diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc index 3998ba66..113e0a81 100644 --- a/src/tuning/transpose.cc +++ b/src/tuning/transpose.cc @@ -31,7 +31,7 @@ class TuneTranspose { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/transpose.opencl" + #include "../src/kernels/level3/transpose.opencl" ; } diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc index e820cfb0..c06e3e72 100644 --- a/src/tuning/xgemm.cc +++ b/src/tuning/xgemm.cc @@ -31,7 +31,7 @@ class TuneXgemm { static std::string GetSources() { return #include "../src/kernels/common.opencl" - #include "../src/kernels/xgemm.opencl" + #include "../src/kernels/level3/xgemm.opencl" ; } |