Added initial naive version of Xgemv kernel

author: CNugteren <web@cedricnugteren.nl> 2015-06-10 08:44:30 +0200
committer: CNugteren <web@cedricnugteren.nl> 2015-06-10 08:44:30 +0200
commit: 85c1db93221bf9d71083c6725a33ccbcd1b61de4 (patch)
tree: e53e4d62d53cc85c4383bccea12904f27c4ac7bc /src
parent: 3c17c1c13313022879c8caf289d0f47ea5d7d22d (diff)
3 files changed, 207 insertions, 0 deletions
diff --git a/src/kernels/xgemv.opencl b/src/kernels/xgemv.opencl
new file mode 100644
index 00000000..c90bc26e
--- /dev/null
+++ b/src/kernels/xgemv.opencl
@@ -0,0 +1,60 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains the Xgemv kernel for matrix-vector multiplication.
+//
+// =================================================================================================
+
+// Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
+// literal). Comment-out this line for syntax-highlighting when developing.
+R"(
+
+// =================================================================================================
+
+// Parameters set by the tuner or by the database. Here they are given a basic default value in case
+// this kernel file is used outside of the CLBlast library.
+#ifndef WGS
+  #define WGS 64     // The local work-group size
+#endif
+#ifndef WPT
+  #define WPT 1      // The amount of work-per-thread
+#endif
+#ifndef VW
+  #define VW 1       // Vector width of vectors X and Y
+#endif
+
+// =================================================================================================
+
+// The gemv kernel
+__attribute__((reqd_work_group_size(WGS, 1, 1)))
+__kernel void Xgemv(const int m, const int n, const real alpha, const real beta,
+                    const __global real* restrict agm,
+                    const __global real* restrict xgm, const int x_offset, const int x_inc,
+                    __global real* ygm, const int y_offset, const int y_inc) {
+
+  // Loops over the work that needs to be done (allows for an arbitrary number of threads)
+  #pragma unroll
+  for (int id = get_global_id(0); id<m; id += get_global_size(0)) {
+
+    // Loop over the elements of the matrix A
+    real acc;
+    SetToZero(acc);
+    for (int k=0; k<n; ++k) {
+      MultiplyAdd(acc, agm[id + m*k], xgm[k*x_inc + x_offset]);
+    }
+    AXPBY(ygm[id*y_inc + y_offset], alpha, acc, beta, ygm[id*y_inc + y_offset]);
+  }
+}
+
+// =================================================================================================
+
+// End of the C++11 raw string literal
+)";
+
+// =================================================================================================
diff --git a/src/tuning/tuning.cc b/src/tuning/tuning.cc
index bb93c053..94333089 100644
--- a/src/tuning/tuning.cc
+++ b/src/tuning/tuning.cc
@@ -73,6 +73,64 @@ template void TunerXY<double2>(int, char**, const Tuner2<double2>&);
 // =================================================================================================
 
 // Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
+// the results. Used for matrix-vector-vector routines.
+template <typename T>
+void TunerAXY(int argc, char* argv[], const Tuner3<T> &tune_function) {
+
+  // Sets the parameters and platform/device for which to tune (command-line options)
+  auto help = std::string{"* Options given/available:\n"};
+  auto args = Arguments<T>{};
+  args.platform_id = GetArgument(argc, argv, help, kArgPlatform, size_t{0});
+  args.device_id   = GetArgument(argc, argv, help, kArgDevice, size_t{0});
+  args.precision   = GetArgument(argc, argv, help, kArgPrecision, Precision::kSingle);
+  args.m           = GetArgument(argc, argv, help, kArgM, size_t{1024});
+  args.n           = GetArgument(argc, argv, help, kArgN, size_t{1024});
+  args.alpha       = GetArgument(argc, argv, help, kArgAlpha, GetScalar<T>());
+  args.beta        = GetArgument(argc, argv, help, kArgBeta, GetScalar<T>());
+  fprintf(stdout, "%s\n", help.c_str());
+
+  // Creates input buffers with random data
+  auto a_mat = std::vector<T>(args.m * args.n);
+  auto x_vec = std::vector<T>(args.n);
+  auto y_vec = std::vector<T>(args.m);
+  PopulateVector(a_mat);
+  PopulateVector(x_vec);
+  PopulateVector(y_vec);
+
+  // Initializes the tuner for the chosen device
+  cltune::Tuner tuner(args.platform_id, args.device_id);
+
+  // Use full-search to explore all parameter combinations.
+  tuner.UseFullSearch();
+
+  // Configures the tuning parameters (kernel specific)
+  tune_function(args, a_mat, x_vec, y_vec, tuner);
+
+  // Starts the tuning process
+  tuner.Tune();
+
+  // Prints the results to screen
+  auto time_ms = tuner.PrintToScreen();
+  tuner.PrintFormatted();
+
+  // Also prints the performance of the best-case in terms of GB/s and GFLOPS
+  const auto mega_bytes = ((args.m*args.n + 2*args.m + args.n)*GetBytes(args.precision)) * 1.0e-6;
+  const auto mega_flops = (2*args.m*args.n) * 1.0e-6;
+  if (time_ms != 0.0) {
+    printf("[ -------> ] %.1lf ms or %.1lf GB/s or %.1lf GFLOPS\n",
+           time_ms, mega_bytes/time_ms, mega_flops/time_ms);
+  }
+}
+
+// Compiles the above function
+template void TunerAXY<float>(int, char**, const Tuner3<float>&);
+template void TunerAXY<double>(int, char**, const Tuner3<double>&);
+template void TunerAXY<float2>(int, char**, const Tuner3<float2>&);
+template void TunerAXY<double2>(int, char**, const Tuner3<double2>&);
+
+// =================================================================================================
+
+// Function to get command-line argument, set-up the input buffers, configure the tuner, and collect
 // the results. Used for matrix-matrix routines.
 template <typename T>
 void TunerAB(int argc, char* argv[], const Tuner2<T> &tune_function) {
diff --git a/src/tuning/xgemv.cc b/src/tuning/xgemv.cc
new file mode 100644
index 00000000..1ee7c7bf
--- /dev/null
+++ b/src/tuning/xgemv.cc
@@ -0,0 +1,89 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file implements an auto-tuner to tune the Xgemv OpenCL kernel. It uses the CLTune library.
+//
+// =================================================================================================
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#include "internal/utilities.h"
+#include "internal/tuning.h"
+
+namespace clblast {
+// =================================================================================================
+
+// The Xgemv auto-tuner
+template <typename T>
+void XgemvTune(const Arguments<T> &args,
+               const std::vector<T> &a_mat, const std::vector<T> &x_vec, std::vector<T> &y_vec,
+               cltune::Tuner &tuner) {
+
+  // This points to the Xgemv kernel as found in the CLBlast library
+  std::string common_source =
+  #include "../src/kernels/common.opencl"
+  std::string kernel_source =
+  #include "../src/kernels/xgemv.opencl"
+  auto sources = common_source + kernel_source;
+  auto id = tuner.AddKernelFromString(sources, "Xgemv", {args.m}, {1});
+  tuner.SetReferenceFromString(sources, "Xgemv", {args.m}, {64});
+
+  // Sets the tunable parameters and their possible values
+  tuner.AddParameter(id, "WGS", {64, 128});
+  tuner.AddParameter(id, "WPT", {1});
+  tuner.AddParameter(id, "VW", {1});
+
+  // Tests for a specific precision
+  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
+  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));
+
+  // Modifies the thread-sizes (local) based on the parameters
+  tuner.MulLocalSize(id, {"WGS"});
+  tuner.DivGlobalSize(id, {"WPT"});
+  tuner.DivGlobalSize(id, {"VW"});
+
+  // Sets the function's arguments
+  tuner.AddArgumentScalar(static_cast<int>(args.m));
+  tuner.AddArgumentScalar(static_cast<int>(args.n));
+  tuner.AddArgumentScalar(args.alpha);
+  tuner.AddArgumentScalar(args.beta);
+  tuner.AddArgumentInput(a_mat);
+  tuner.AddArgumentInput(x_vec);
+  tuner.AddArgumentScalar(0);
+  tuner.AddArgumentScalar(1);
+  tuner.AddArgumentOutput(y_vec);
+  tuner.AddArgumentScalar(0);
+  tuner.AddArgumentScalar(1);
+}
+
+// =================================================================================================
+
+// Main function which calls the common client code with the routine-specific function as argument.
+void TunerXgemv(int argc, char *argv[]) {
+  switch(GetPrecision(argc, argv)) {
+    case Precision::kHalf: throw std::runtime_error("Unsupported precision mode");
+    case Precision::kSingle: TunerAXY<float>(argc, argv, XgemvTune<float>); break;
+    case Precision::kDouble: TunerAXY<double>(argc, argv, XgemvTune<double>); break;
+    case Precision::kComplexSingle: TunerAXY<float2>(argc, argv, XgemvTune<float2>); break;
+    case Precision::kComplexDouble: TunerAXY<double2>(argc, argv, XgemvTune<double2>); break;
+  }
+}
+
+// =================================================================================================
+} // namespace clblast
+
+// Main function (not within the clblast namespace)
+int main(int argc, char *argv[]) {
+  clblast::TunerXgemv(argc, argv);
+  return 0;
+}
+
+// =================================================================================================
author	CNugteren <web@cedricnugteren.nl>	2015-06-10 08:44:30 +0200
committer	CNugteren <web@cedricnugteren.nl>	2015-06-10 08:44:30 +0200
commit	85c1db93221bf9d71083c6725a33ccbcd1b61de4 (patch)
tree	e53e4d62d53cc85c4383bccea12904f27c4ac7bc /src
parent	3c17c1c13313022879c8caf289d0f47ea5d7d22d (diff)