summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-05-22 16:18:08 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-05-22 16:18:08 +0200
commitc8ff3f143fe94c87b23fd1bf36c1a4f91d305f01 (patch)
tree9c32e1a944c58ffcd711a7a69903fe6f9f95911d
parent95b828da124b9c5c101d95cb51a12e9d387d1a34 (diff)
Prepared the GER kernels and tuner for half-precision support
-rw-r--r--include/internal/routines/level2/xger.h1
-rw-r--r--include/internal/routines/level2/xher.h1
-rw-r--r--include/internal/routines/level2/xher2.h1
-rw-r--r--src/kernels/level2/xger.opencl4
-rw-r--r--src/kernels/level2/xher.opencl4
-rw-r--r--src/kernels/level2/xher2.opencl4
-rw-r--r--src/routines/level2/xger.cc8
-rw-r--r--src/routines/level2/xher.cc14
-rw-r--r--src/routines/level2/xher2.cc8
-rw-r--r--src/tuning/xger.cc5
10 files changed, 36 insertions, 14 deletions
diff --git a/include/internal/routines/level2/xger.h b/include/internal/routines/level2/xger.h
index 5ace9da6..1d5c64bd 100644
--- a/include/internal/routines/level2/xger.h
+++ b/include/internal/routines/level2/xger.h
@@ -29,6 +29,7 @@ class Xger: public Routine<T> {
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
+ using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
diff --git a/include/internal/routines/level2/xher.h b/include/internal/routines/level2/xher.h
index 861ba302..ebd20ee8 100644
--- a/include/internal/routines/level2/xher.h
+++ b/include/internal/routines/level2/xher.h
@@ -29,6 +29,7 @@ class Xher: public Routine<T> {
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
+ using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestMatrixA;
diff --git a/include/internal/routines/level2/xher2.h b/include/internal/routines/level2/xher2.h
index 9a23199e..a33a71c3 100644
--- a/include/internal/routines/level2/xher2.h
+++ b/include/internal/routines/level2/xher2.h
@@ -29,6 +29,7 @@ class Xher2: public Routine<T> {
using Routine<T>::source_string_;
using Routine<T>::queue_;
using Routine<T>::event_;
+ using Routine<T>::context_;
using Routine<T>::GetProgramFromCache;
using Routine<T>::TestVectorX;
using Routine<T>::TestVectorY;
diff --git a/src/kernels/level2/xger.opencl b/src/kernels/level2/xger.opencl
index d377fbb0..63817afb 100644
--- a/src/kernels/level2/xger.opencl
+++ b/src/kernels/level2/xger.opencl
@@ -19,11 +19,13 @@ R"(
// Regular version of the rank-1 matrix update kernel (GER, GERU, GERC)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xger(const int max1, const int max2, const real alpha,
+__kernel void Xger(const int max1, const int max2,
+ const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_rowmajor) {
+ const real alpha = arg_alpha[0];
// Register storage for X and Y
real xvalues[WPT];
diff --git a/src/kernels/level2/xher.opencl b/src/kernels/level2/xher.opencl
index edb94ca8..fc635f2e 100644
--- a/src/kernels/level2/xher.opencl
+++ b/src/kernels/level2/xher.opencl
@@ -19,10 +19,12 @@ R"(
// Symmetric version of the rank-1 matrix update kernel (HER, HPR, SYR, SPR)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher(const int n, const real alpha,
+__kernel void Xher(const int n,
+ const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
+ const real alpha = arg_alpha[0];
// Register storage for X and XT
real xvalues[WPT];
diff --git a/src/kernels/level2/xher2.opencl b/src/kernels/level2/xher2.opencl
index 4a2edce8..a66f255f 100644
--- a/src/kernels/level2/xher2.opencl
+++ b/src/kernels/level2/xher2.opencl
@@ -19,11 +19,13 @@ R"(
// Symmetric version of the rank-2 matrix update kernel (HER2, HPR2, SYR2, SPR2)
__attribute__((reqd_work_group_size(WGS1, WGS2, 1)))
-__kernel void Xher2(const int n, const real alpha,
+__kernel void Xher2(const int n,
+ const __constant real* restrict arg_alpha,
const __global real* restrict xgm, const int x_offset, const int x_inc,
const __global real* restrict ygm, const int y_offset, const int y_inc,
__global real* restrict agm, const int a_offset, const int a_ld,
const int is_upper, const int is_rowmajor) {
+ const real alpha = arg_alpha[0];
// Register storage for X and Y
real xvalues[WPT];
diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc
index 686c7e60..47d7abe2 100644
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cc
@@ -64,7 +64,11 @@ StatusCode Xger<T>::DoGer(const Layout layout,
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
- // Retrieves the Xgemv kernel from the compiled binary
+ // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &alpha);
+
+ // Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, "Xger");
@@ -72,7 +76,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(a_one));
kernel.SetArgument(1, static_cast<int>(a_two));
- kernel.SetArgument(2, alpha);
+ kernel.SetArgument(2, alpha_buffer());
kernel.SetArgument(3, x_buffer());
kernel.SetArgument(4, static_cast<int>(x_offset));
kernel.SetArgument(5, static_cast<int>(x_inc));
diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc
index a7116213..852e3f15 100644
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cc
@@ -63,9 +63,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
(triangle == Triangle::kLower && layout == Layout::kRowMajor));
const auto is_rowmajor = (layout == Layout::kRowMajor);
- // Creates a matching version of alpha
- const auto matching_alpha = GetAlpha(alpha);
-
// Tests the matrix and the vectors for validity
auto status = StatusCode::kSuccess;
if (packed) { status = TestMatrixAP(n, a_buffer, a_offset, sizeof(T)); }
@@ -77,14 +74,21 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
// If alpha is zero an update is not required
if (alpha == U{0}) { return StatusCode::kSuccess; }
- // Retrieves the Xgemv kernel from the compiled binary
+ // Creates a matching version of alpha
+ const auto matching_alpha = GetAlpha(alpha);
+
+ // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &matching_alpha);
+
+ // Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, "Xher");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, matching_alpha);
+ kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc
index 3fd1a961..82052187 100644
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cc
@@ -66,14 +66,18 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
status = TestVectorY(n, y_buffer, y_offset, y_inc, sizeof(T));
if (ErrorIn(status)) { return status; }
- // Retrieves the Xgemv kernel from the compiled binary
+ // Upload the scalar argument as a constant buffer to the device (needed for half-precision)
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &alpha);
+
+ // Retrieves the kernel from the compiled binary
try {
const auto program = GetProgramFromCache();
auto kernel = Kernel(program, "Xher2");
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n));
- kernel.SetArgument(1, alpha);
+ kernel.SetArgument(1, alpha_buffer());
kernel.SetArgument(2, x_buffer());
kernel.SetArgument(3, static_cast<int>(x_offset));
kernel.SetArgument(4, static_cast<int>(x_inc));
diff --git a/src/tuning/xger.cc b/src/tuning/xger.cc
index 39efdb81..4be80c86 100644
--- a/src/tuning/xger.cc
+++ b/src/tuning/xger.cc
@@ -85,9 +85,10 @@ class TuneXger {
std::vector<T> &x_vec, std::vector<T> &y_vec,
std::vector<T> &a_mat, std::vector<T> &, std::vector<T> &,
std::vector<T> &) {
+ auto alpha_buffer = std::vector<T>{args.alpha};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
- tuner.AddArgumentScalar(args.alpha);
+ tuner.AddArgumentInput(alpha_buffer);
tuner.AddArgumentInput(x_vec);
tuner.AddArgumentScalar(0); // x_offset
tuner.AddArgumentScalar(1); // x_increment
@@ -117,7 +118,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
- case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXger<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXger<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXger<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXger<float2>, float2>(argc, argv); break;