summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/kernels/common.opencl6
-rw-r--r--src/kernels/level3/xgemm_part2.opencl15
-rw-r--r--src/routines/level3/xgemm.cc11
-rw-r--r--src/routines/level3/xher2k.cc18
-rw-r--r--src/routines/level3/xherk.cc14
-rw-r--r--src/routines/level3/xsyr2k.cc13
-rw-r--r--src/routines/level3/xsyrk.cc10
-rw-r--r--src/tuning/copy.cc2
-rw-r--r--src/tuning/pad.cc24
-rw-r--r--src/tuning/padtranspose.cc2
-rw-r--r--src/tuning/transpose.cc2
-rw-r--r--src/tuning/xgemm.cc8
12 files changed, 85 insertions, 40 deletions
diff --git a/src/kernels/common.opencl b/src/kernels/common.opencl
index f0da5a47..01605f6e 100644
--- a/src/kernels/common.opencl
+++ b/src/kernels/common.opencl
@@ -43,9 +43,9 @@ R"(
typedef half4 real4;
typedef half8 real8;
typedef half16 real16;
- #define ZERO 0.0h
- #define ONE 1.0h
- #define SMALLEST -1.0e37h
+ #define ZERO 0
+ #define ONE 1
+ #define SMALLEST -1.0e14
// Single-precision
#elif PRECISION == 32
diff --git a/src/kernels/level3/xgemm_part2.opencl b/src/kernels/level3/xgemm_part2.opencl
index c0760db6..a8c8ebf5 100644
--- a/src/kernels/level3/xgemm_part2.opencl
+++ b/src/kernels/level3/xgemm_part2.opencl
@@ -263,10 +263,13 @@ inline void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the upper-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmUpper(const int kSizeN, const int kSizeK,
- const real alpha, const real beta,
+ const __constant real* restrict arg_alpha,
+ const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
+ const real alpha = arg_alpha[0];
+ const real beta = arg_beta[0];
// Skip these threads if they do not contain threads contributing to the upper-triangle
if (get_group_id(1)*NWG < get_group_id(0)*MWG) {
@@ -300,10 +303,13 @@ __kernel void XgemmUpper(const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the lower-triangular version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void XgemmLower(const int kSizeN, const int kSizeK,
- const real alpha, const real beta,
+ const __constant real* restrict arg_alpha,
+ const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
+ const real alpha = arg_alpha[0];
+ const real beta = arg_beta[0];
// Skip these threads if they do not contain threads contributing to the lower-triangle
if (get_group_id(1)*NWG > get_group_id(0)*MWG) {
@@ -341,10 +347,13 @@ __kernel void XgemmLower(const int kSizeN, const int kSizeK,
// Main entry point of the kernel. This is the regular full version.
__attribute__((reqd_work_group_size(MDIMC, NDIMC, 1)))
__kernel void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
- const real alpha, const real beta,
+ const __constant real* restrict arg_alpha,
+ const __constant real* restrict arg_beta,
const __global realM* restrict agm,
const __global realN* restrict bgm,
__global realM* cgm) {
+ const real alpha = arg_alpha[0];
+ const real beta = arg_beta[0];
// Allocates workgroup-private memory (local memory)
#if SA == 1
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 11116aae..5395667a 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -123,6 +123,12 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
+ // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ auto beta_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &alpha);
+ beta_buffer.Write(queue_, 1, &beta);
+
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@@ -170,8 +176,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
kernel.SetArgument(0, static_cast<int>(m_ceiled));
kernel.SetArgument(1, static_cast<int>(n_ceiled));
kernel.SetArgument(2, static_cast<int>(k_ceiled));
- kernel.SetArgument(3, alpha);
- kernel.SetArgument(4, beta);
+ kernel.SetArgument(3, alpha_buffer());
+ kernel.SetArgument(4, beta_buffer());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, b_temp());
kernel.SetArgument(7, c_temp());
@@ -207,6 +213,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
// =================================================================================================
// Compiles the templated class
+template class Xgemm<half>;
template class Xgemm<float>;
template class Xgemm<double>;
template class Xgemm<float2>;
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index 2c2c815d..1acba517 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -112,6 +112,13 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+ // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+ auto complex_beta = T{beta, static_cast<U>(0.0)};
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ auto beta_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &alpha);
+ beta_buffer.Write(queue_, 1, &complex_beta);
+
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@@ -171,11 +178,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
- auto complex_beta = T{beta, static_cast<U>(0.0)};
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, alpha);
- kernel.SetArgument(3, complex_beta);
+ kernel.SetArgument(2, alpha_buffer());
+ kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, a1_temp());
kernel.SetArgument(5, b2_temp());
kernel.SetArgument(6, c_temp());
@@ -196,8 +202,10 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
// Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha
auto conjugate_alpha = T{alpha.real(), -alpha.imag()};
auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)};
- kernel.SetArgument(2, conjugate_alpha);
- kernel.SetArgument(3, complex_one);
+ alpha_buffer.Write(queue_, 1, &conjugate_alpha);
+ beta_buffer.Write(queue_, 1, &complex_one);
+ kernel.SetArgument(2, alpha_buffer());
+ kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, b1_temp());
kernel.SetArgument(5, a2_temp());
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index 414c4760..ea1aa614 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -103,6 +103,14 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+ // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+ auto complex_alpha = T{alpha, static_cast<U>(0.0)};
+ auto complex_beta = T{beta, static_cast<U>(0.0)};
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ auto beta_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &complex_alpha);
+ beta_buffer.Write(queue_, 1, &complex_beta);
+
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@@ -144,12 +152,10 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
auto kernel = Kernel(program, kernel_name);
// Sets the kernel arguments
- auto complex_alpha = T{alpha, static_cast<U>(0.0)};
- auto complex_beta = T{beta, static_cast<U>(0.0)};
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, complex_alpha);
- kernel.SetArgument(3, complex_beta);
+ kernel.SetArgument(2, alpha_buffer());
+ kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index 3206c669..c52e1353 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -104,6 +104,12 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+ // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ auto beta_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &alpha);
+ beta_buffer.Write(queue_, 1, &beta);
+
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@@ -147,8 +153,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, alpha);
- kernel.SetArgument(3, beta);
+ kernel.SetArgument(2, alpha_buffer());
+ kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, b_temp());
kernel.SetArgument(6, c_temp());
@@ -168,7 +174,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
// Swaps the arguments for matrices A and B, and sets 'beta' to 1
auto one = static_cast<T>(1);
- kernel.SetArgument(3, one);
+ beta_buffer.Write(queue_, 1, &one);
+ kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, b_temp());
kernel.SetArgument(5, a_temp());
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index 741ad064..cfcd4e12 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -97,6 +97,12 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
+ // Upload the scalar arguments as constant buffers to the device (needed for half-precision)
+ auto alpha_buffer = Buffer<T>(context_, 1);
+ auto beta_buffer = Buffer<T>(context_, 1);
+ alpha_buffer.Write(queue_, 1, &alpha);
+ beta_buffer.Write(queue_, 1, &beta);
+
// Events of all kernels (including pre/post processing kernels)
auto eventWaitList = std::vector<Event>();
auto emptyEventList = std::vector<Event>();
@@ -131,8 +137,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
// Sets the kernel arguments
kernel.SetArgument(0, static_cast<int>(n_ceiled));
kernel.SetArgument(1, static_cast<int>(k_ceiled));
- kernel.SetArgument(2, alpha);
- kernel.SetArgument(3, beta);
+ kernel.SetArgument(2, alpha_buffer());
+ kernel.SetArgument(3, beta_buffer());
kernel.SetArgument(4, a_temp());
kernel.SetArgument(5, a_temp());
kernel.SetArgument(6, c_temp());
diff --git a/src/tuning/copy.cc b/src/tuning/copy.cc
index e2837e60..09cdecf1 100644
--- a/src/tuning/copy.cc
+++ b/src/tuning/copy.cc
@@ -107,7 +107,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
- case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneCopy<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneCopy<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneCopy<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneCopy<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/pad.cc b/src/tuning/pad.cc
index 72729422..075688db 100644
--- a/src/tuning/pad.cc
+++ b/src/tuning/pad.cc
@@ -85,17 +85,17 @@ class TunePad {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &,
std::vector<T> &) {
- tuner.AddArgumentScalar(static_cast<int>(args.m));
- tuner.AddArgumentScalar(static_cast<int>(args.n));
- tuner.AddArgumentScalar(static_cast<int>(args.m));
- tuner.AddArgumentScalar(0);
- tuner.AddArgumentInput(a_mat);
- tuner.AddArgumentScalar(static_cast<int>(args.m));
- tuner.AddArgumentScalar(static_cast<int>(args.n));
- tuner.AddArgumentScalar(static_cast<int>(args.m));
- tuner.AddArgumentScalar(0);
- tuner.AddArgumentOutput(b_mat);
- tuner.AddArgumentScalar(0);
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(0);
+ tuner.AddArgumentInput(a_mat);
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(static_cast<int>(args.n));
+ tuner.AddArgumentScalar(static_cast<int>(args.m));
+ tuner.AddArgumentScalar(0);
+ tuner.AddArgumentOutput(b_mat);
+ tuner.AddArgumentScalar(0);
}
// Describes how to compute the performance metrics
@@ -115,7 +115,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
- case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePad<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePad<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePad<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePad<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/padtranspose.cc b/src/tuning/padtranspose.cc
index 5edd89e0..a970f982 100644
--- a/src/tuning/padtranspose.cc
+++ b/src/tuning/padtranspose.cc
@@ -119,7 +119,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
- case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case clblast::Precision::kHalf: clblast::Tuner<clblast::TunePadTranspose<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TunePadTranspose<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TunePadTranspose<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TunePadTranspose<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/transpose.cc b/src/tuning/transpose.cc
index 113e0a81..d217a3df 100644
--- a/src/tuning/transpose.cc
+++ b/src/tuning/transpose.cc
@@ -112,7 +112,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
- case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneTranspose<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneTranspose<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneTranspose<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneTranspose<float2>, float2>(argc, argv); break;
diff --git a/src/tuning/xgemm.cc b/src/tuning/xgemm.cc
index 2b4ff456..d309b830 100644
--- a/src/tuning/xgemm.cc
+++ b/src/tuning/xgemm.cc
@@ -121,11 +121,13 @@ class TuneXgemm {
std::vector<T> &, std::vector<T> &,
std::vector<T> &a_mat, std::vector<T> &b_mat, std::vector<T> &c_mat,
std::vector<T> &) {
+ auto alpha_buffer = std::vector<T>{args.alpha};
+ auto beta_buffer = std::vector<T>{args.beta};
tuner.AddArgumentScalar(static_cast<int>(args.m));
tuner.AddArgumentScalar(static_cast<int>(args.n));
tuner.AddArgumentScalar(static_cast<int>(args.k));
- tuner.AddArgumentScalar(args.alpha);
- tuner.AddArgumentScalar(args.beta);
+ tuner.AddArgumentInput(alpha_buffer);
+ tuner.AddArgumentInput(beta_buffer);
tuner.AddArgumentInput(a_mat);
tuner.AddArgumentInput(b_mat);
tuner.AddArgumentOutput(c_mat);
@@ -148,7 +150,7 @@ using double2 = clblast::double2;
// Main function (not within the clblast namespace)
int main(int argc, char *argv[]) {
switch(clblast::GetPrecision(argc, argv)) {
- case clblast::Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+ case clblast::Precision::kHalf: clblast::Tuner<clblast::TuneXgemm<half>, half>(argc, argv); break;
case clblast::Precision::kSingle: clblast::Tuner<clblast::TuneXgemm<float>, float>(argc, argv); break;
case clblast::Precision::kDouble: clblast::Tuner<clblast::TuneXgemm<double>, double>(argc, argv); break;
case clblast::Precision::kComplexSingle: clblast::Tuner<clblast::TuneXgemm<float2>, float2>(argc, argv); break;