diff options
-rw-r--r-- | include/internal/routines/level2/xger.h | 1 | ||||
-rw-r--r-- | include/internal/routines/level2/xher.h | 1 | ||||
-rw-r--r-- | include/internal/routines/level2/xher2.h | 1 | ||||
-rw-r--r-- | src/kernels/level2/xger.opencl | 4 | ||||
-rw-r--r-- | src/kernels/level2/xher.opencl | 4 | ||||
-rw-r--r-- | src/kernels/level2/xher2.opencl | 4 | ||||
-rw-r--r-- | src/routines/level2/xger.cc | 8 | ||||
-rw-r--r-- | src/routines/level2/xher.cc | 14 | ||||
-rw-r--r-- | src/routines/level2/xher2.cc | 8 | ||||
-rw-r--r-- | src/tuning/xger.cc | 5 |
10 files changed, 36 insertions, 14 deletions
diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h index 5ace9da6..1d5c64bd 100644 --- a/include/internal/routines/level2/xger.h +++ b/include/internal/routines/level2/xger.h @@ -29,6 +29,7 @@ class Xger: public Routine<T> { using Routine<T>::source_string_; using Routine<T>::queue_; using Routine<T>::event_; + using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h index 861ba302..ebd20ee8 100644 --- a/include/internal/routines/level2/xher.h +++ b/include/internal/routines/level2/xher.h @@ -29,6 +29,7 @@ class Xher: public Routine<T> { using Routine<T>::source_string_; using Routine<T>::queue_; using Routine<T>::event_; + using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestMatrixA; diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h index 9a23199e..a33a71c3 100644 --- a/include/internal/routines/level2/xher2.h +++ b/include/internal/routines/level2/xher2.h @@ -29,6 +29,7 @@ class Xher2: public Routine<T> { using Routine<T>::source_string_; using Routine<T>::queue_; using Routine<T>::event_; + using Routine<T>::context_; using Routine<T>::GetProgramFromCache; using Routine<T>::TestVectorX; using Routine<T>::TestVectorY; diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl index d377fbb0..63817afb 100644 --- a/src/kernels/level2/xger.opencl +++ b/src/kernels/level2/xger.opencl @@ -19,11 +19,13 @@ R"( // Regular version of the rank-1 matrix update kernel (GER, GERU, GERC) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xger(const int max1, const int max2, const real alpha, +__kernel void Xger(const int max1, const int max2, + const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_rowmajor) { + const real alpha = arg_alpha[0]; // Register storage for X and Y real xvalues[WPT]; diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl index edb94ca8..fc635f2e 100644 --- a/src/kernels/level2/xher.opencl +++ b/src/kernels/level2/xher.opencl @@ -19,10 +19,12 @@ R"( // Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xher(const int n, const real alpha, +__kernel void Xher(const int n, + const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { + const real alpha = arg_alpha[0]; // Register storage for X and XT real xvalues[WPT]; diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl index 4a2edce8..a66f255f 100644 --- a/src/kernels/level2/xher2.opencl +++ b/src/kernels/level2/xher2.opencl @@ -19,11 +19,13 @@ R"( // Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2) __attribute__((reqd_work_group_size(WGS1, WGS2, 1))) -__kernel void Xher2(const int n, const real alpha, +__kernel void Xher2(const int n, + const __constant real* restrict arg_alpha, const __global real* restrict xgm, const int x_offset, const int x_inc, const __global real* restrict ygm, const int y_offset, const int y_inc, __global real* restrict agm, const int a_offset, const int a_ld, const int is_upper, const int is_rowmajor) { + const real alpha = arg_alpha[0]; // Register storage for X and Y real xvalues[WPT]; diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index 686c7e60..47d7abe2 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -64,7 +64,11 @@ StatusCode Xger<T>::DoGer(const Layout layout, status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); if (ErrorIn(status)) { return status; } - // Retrieves the Xgemv kernel from the compiled binary + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer<T>(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + + // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xger"); @@ -72,7 +76,7 @@ StatusCode Xger<T>::DoGer(const Layout layout, // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(a_one)); kernel.SetArgument(1, static_cast<int>(a_two)); - kernel.SetArgument(2, alpha); + kernel.SetArgument(2, alpha_buffer()); kernel.SetArgument(3, x_buffer()); kernel.SetArgument(4, static_cast<int>(x_offset)); kernel.SetArgument(5, static_cast<int>(x_inc)); diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index a7116213..852e3f15 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -63,9 +63,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, (triangle == Triangle::kLower && layout == Layout::kRowMajor)); const auto is_rowmajor = (layout == Layout::kRowMajor); - // Creates a matching version of alpha - const auto matching_alpha = GetAlpha(alpha); - // Tests the matrix and the vectors for validity auto status = StatusCode::kSuccess; if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); } @@ -77,14 +74,21 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, // If alpha is zero an update is not required if (alpha == U{0}) { return StatusCode::kSuccess; } - // Retrieves the Xgemv kernel from the compiled binary + // Creates a matching version of alpha + const auto matching_alpha = GetAlpha(alpha); + + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer<T>(context_, 1); + alpha_buffer.Write(queue_, 1, &matching_alpha); + + // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xher"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, matching_alpha); + kernel.SetArgument(1, alpha_buffer()); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast<int>(x_offset)); kernel.SetArgument(4, static_cast<int>(x_inc)); diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index 3fd1a961..82052187 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -66,14 +66,18 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T)); if (ErrorIn(status)) { return status; } - // Retrieves the Xgemv kernel from the compiled binary + // Upload the scalar argument as a constant buffer to the device (needed for half-precision) + auto alpha_buffer = Buffer<T>(context_, 1); + alpha_buffer.Write(queue_, 1, &alpha); + + // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xher2"); // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha); + kernel.SetArgument(1, alpha_buffer()); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast<int>(x_offset)); kernel.SetArgument(4, static_cast<int>(x_inc)); diff --git a/src/tuning/xger.cc b/src/tuning/xger.cc index 39efdb81..4be80c86 100644 --- a/src/tuning/xger.cc +++ b/src/tuning/xger.cc @@ -85,9 +85,10 @@ class TuneXger { std::vector<T> &x_vec, std::vector<T> &y_vec, std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &, std::vector<T> &) { + auto alpha_buffer = std::vector<T>{args.alpha}; tuner.AddArgumentScalar(static_cast<int>(args.m)); tuner.AddArgumentScalar(static_cast<int>(args.n)); - tuner.AddArgumentScalar(args.alpha); + tuner.AddArgumentInput(alpha_buffer); tuner.AddArgumentInput(x_vec); tuner.AddArgumentScalar(0); // x_offset tuner.AddArgumentScalar(1); // x_increment @@ -117,7 +118,7 @@ using double2 = clblast::double2; // Main function (not within the clblast namespace) int main(int argc, char *argv[]) { switch(clblast::GetPrecision(argc, argv)) { - case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode"); + case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXger<half>, half>(argc, argv); break; case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv); break; case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv); break; case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv); break; |