diff options
Diffstat (limited to 'src/routines')
-rw-r--r-- | src/routines/common.hpp | 8 | ||||
-rw-r--r-- | src/routines/level1/xaxpy.cpp | 8 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cpp | 10 | ||||
-rw-r--r-- | src/routines/level2/xger.cpp | 6 | ||||
-rw-r--r-- | src/routines/level2/xher.cpp | 6 | ||||
-rw-r--r-- | src/routines/level2/xher2.cpp | 6 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cpp | 10 | ||||
-rw-r--r-- | src/routines/level3/xher2k.cpp | 16 | ||||
-rw-r--r-- | src/routines/level3/xherk.cpp | 10 | ||||
-rw-r--r-- | src/routines/level3/xsyr2k.cpp | 13 | ||||
-rw-r--r-- | src/routines/level3/xsyrk.cpp | 10 |
11 files changed, 24 insertions, 79 deletions
diff --git a/src/routines/common.hpp b/src/routines/common.hpp index c99cd39d..e624a2b1 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -88,10 +88,6 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont } } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context, 1); - alpha_buffer.Write(queue, 1, &alpha); - // Retrieves the kernel from the compiled binary try { auto kernel = Kernel(program, kernel_name); @@ -101,7 +97,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont kernel.SetArgument(0, static_cast<int>(src_ld)); kernel.SetArgument(1, src()); kernel.SetArgument(2, dest()); - kernel.SetArgument(3, alpha_buffer()); + kernel.SetArgument(3, GetRealArg(alpha)); } else { kernel.SetArgument(0, static_cast<int>(src_one)); @@ -114,7 +110,7 @@ StatusCode PadCopyTransposeMatrix(Queue &queue, const Device &device, const Cont kernel.SetArgument(7, static_cast<int>(dest_ld)); kernel.SetArgument(8, static_cast<int>(dest_offset)); kernel.SetArgument(9, dest()); - kernel.SetArgument(10, alpha_buffer()); + kernel.SetArgument(10, GetRealArg(alpha)); if (do_pad) { kernel.SetArgument(11, static_cast<int>(do_conjugate)); } diff --git a/src/routines/level1/xaxpy.cpp b/src/routines/level1/xaxpy.cpp index 5b6c9e77..3445e2b5 100644 --- a/src/routines/level1/xaxpy.cpp +++ b/src/routines/level1/xaxpy.cpp @@ -59,20 +59,16 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); auto kernel = Kernel(program, kernel_name); - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Sets the kernel arguments if (use_fast_kernel) { kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, y_buffer()); } else { kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast<int>(x_offset)); kernel.SetArgument(4, static_cast<int>(x_inc)); diff --git a/src/routines/level2/xgemv.cpp b/src/routines/level2/xgemv.cpp index 21fb397c..2842ef07 100644 --- a/src/routines/level2/xgemv.cpp +++ b/src/routines/level2/xgemv.cpp @@ -126,12 +126,6 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, local_size = db_["WGS3"]; } - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - auto beta_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Retrieves the Xgemv kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); @@ -140,8 +134,8 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(m_real)); kernel.SetArgument(1, static_cast<int>(n_real)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, static_cast<int>(a_rotated)); kernel.SetArgument(5, a_buffer()); kernel.SetArgument(6, static_cast<int>(a_offset)); diff --git a/src/routines/level2/xger.cpp b/src/routines/level2/xger.cpp index 353047d2..29cffe0c 100644 --- a/src/routines/level2/xger.cpp +++ b/src/routines/level2/xger.cpp @@ -56,10 +56,6 @@ StatusCode Xger<T>::DoGer(const Layout layout, status = TestVectorY(n, y_buffer, y_offset, y_inc); if (ErrorIn(status)) { return status; } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); @@ -68,7 +64,7 @@ StatusCode Xger<T>::DoGer(const Layout layout, // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(a_one)); kernel.SetArgument(1, static_cast<int>(a_two)); - kernel.SetArgument(2, alpha_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); kernel.SetArgument(3, x_buffer()); kernel.SetArgument(4, static_cast<int>(x_offset)); kernel.SetArgument(5, static_cast<int>(x_inc)); diff --git a/src/routines/level2/xher.cpp b/src/routines/level2/xher.cpp index ed8ba9e9..6dd95938 100644 --- a/src/routines/level2/xher.cpp +++ b/src/routines/level2/xher.cpp @@ -70,10 +70,6 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, // Creates a matching version of alpha const auto matching_alpha = GetAlpha(alpha); - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &matching_alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); @@ -81,7 +77,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(matching_alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast<int>(x_offset)); kernel.SetArgument(4, static_cast<int>(x_inc)); diff --git a/src/routines/level2/xher2.cpp b/src/routines/level2/xher2.cpp index 50572cea..3d57a9b9 100644 --- a/src/routines/level2/xher2.cpp +++ b/src/routines/level2/xher2.cpp @@ -58,10 +58,6 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, status = TestVectorY(n, y_buffer, y_offset, y_inc); if (ErrorIn(status)) { return status; } - // Upload the scalar argument as a constant buffer to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - // Retrieves the kernel from the compiled binary try { const auto program = GetProgramFromCache(context_, PrecisionValue<T>(), routine_name_); @@ -69,7 +65,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n)); - kernel.SetArgument(1, alpha_buffer()); + kernel.SetArgument(1, GetRealArg(alpha)); kernel.SetArgument(2, x_buffer()); kernel.SetArgument(3, static_cast<int>(x_offset)); kernel.SetArgument(4, static_cast<int>(x_inc)); diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp index 9ea5559c..97e8db7e 100644 --- a/src/routines/level3/xgemm.cpp +++ b/src/routines/level3/xgemm.cpp @@ -118,12 +118,6 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - auto beta_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector<Event>(); auto emptyEventList = std::vector<Event>(); @@ -174,8 +168,8 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, kernel.SetArgument(0, static_cast<int>(m_ceiled)); kernel.SetArgument(1, static_cast<int>(n_ceiled)); kernel.SetArgument(2, static_cast<int>(k_ceiled)); - kernel.SetArgument(3, alpha_buffer()); - kernel.SetArgument(4, beta_buffer()); + kernel.SetArgument(3, GetRealArg(alpha)); + kernel.SetArgument(4, GetRealArg(beta)); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, b_temp()); kernel.SetArgument(7, c_temp()); diff --git a/src/routines/level3/xher2k.cpp b/src/routines/level3/xher2k.cpp index bd7a053e..65e2be55 100644 --- a/src/routines/level3/xher2k.cpp +++ b/src/routines/level3/xher2k.cpp @@ -107,12 +107,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + // Convert the arguments to complex versions auto complex_beta = T{beta, static_cast<U>(0.0)}; - auto alpha_buffer = Buffer<T>(context_, 1); - auto beta_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &complex_beta); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector<Event>(); @@ -180,8 +176,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a1_temp()); kernel.SetArgument(5, b2_temp()); kernel.SetArgument(6, c_temp()); @@ -202,10 +198,8 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co // Swaps the arguments for matrices A and B, sets 'beta' to 1, and conjugate alpha auto conjugate_alpha = T{alpha.real(), -alpha.imag()}; auto complex_one = T{static_cast<U>(1.0), static_cast<U>(0.0)}; - alpha_buffer.Write(queue_, 1, &conjugate_alpha); - beta_buffer.Write(queue_, 1, &complex_one); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(conjugate_alpha)); + kernel.SetArgument(3, GetRealArg(complex_one)); kernel.SetArgument(4, b1_temp()); kernel.SetArgument(5, a2_temp()); diff --git a/src/routines/level3/xherk.cpp b/src/routines/level3/xherk.cpp index 6ef7f21f..cc87e3e9 100644 --- a/src/routines/level3/xherk.cpp +++ b/src/routines/level3/xherk.cpp @@ -98,13 +98,9 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) + // Convert the arguments to complex versions auto complex_alpha = T{alpha, static_cast<U>(0.0)}; auto complex_beta = T{beta, static_cast<U>(0.0)}; - auto alpha_buffer = Buffer<T>(context_, 1); - auto beta_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &complex_alpha); - beta_buffer.Write(queue_, 1, &complex_beta); // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector<Event>(); @@ -152,8 +148,8 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(complex_alpha)); + kernel.SetArgument(3, GetRealArg(complex_beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); diff --git a/src/routines/level3/xsyr2k.cpp b/src/routines/level3/xsyr2k.cpp index 424d4d2d..18a1eac7 100644 --- a/src/routines/level3/xsyr2k.cpp +++ b/src/routines/level3/xsyr2k.cpp @@ -97,12 +97,6 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - auto beta_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector<Event>(); auto emptyEventList = std::vector<Event>(); @@ -149,8 +143,8 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, b_temp()); kernel.SetArgument(6, c_temp()); @@ -170,8 +164,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons // Swaps the arguments for matrices A and B, and sets 'beta' to 1 auto one = static_cast<T>(1); - beta_buffer.Write(queue_, 1, &one); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(3, GetRealArg(one)); kernel.SetArgument(4, b_temp()); kernel.SetArgument(5, a_temp()); diff --git a/src/routines/level3/xsyrk.cpp b/src/routines/level3/xsyrk.cpp index f56c232b..1992cec1 100644 --- a/src/routines/level3/xsyrk.cpp +++ b/src/routines/level3/xsyrk.cpp @@ -90,12 +90,6 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); - // Upload the scalar arguments as constant buffers to the device (needed for half-precision) - auto alpha_buffer = Buffer<T>(context_, 1); - auto beta_buffer = Buffer<T>(context_, 1); - alpha_buffer.Write(queue_, 1, &alpha); - beta_buffer.Write(queue_, 1, &beta); - // Events of all kernels (including pre/post processing kernels) auto eventWaitList = std::vector<Event>(); auto emptyEventList = std::vector<Event>(); @@ -132,8 +126,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const // Sets the kernel arguments kernel.SetArgument(0, static_cast<int>(n_ceiled)); kernel.SetArgument(1, static_cast<int>(k_ceiled)); - kernel.SetArgument(2, alpha_buffer()); - kernel.SetArgument(3, beta_buffer()); + kernel.SetArgument(2, GetRealArg(alpha)); + kernel.SetArgument(3, GetRealArg(beta)); kernel.SetArgument(4, a_temp()); kernel.SetArgument(5, a_temp()); kernel.SetArgument(6, c_temp()); |