From c151ab1325bc796aed386f456258b1b8b05aefa6 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Sat, 30 Sep 2017 20:26:26 +0200
Subject: Refactored the tuning architecture: less duplication now; more defaults

---
 src/tuning/kernels/xgemm.cpp | 200 ++++++++++++++++++++-----------------------
 1 file changed, 95 insertions(+), 105 deletions(-)

(limited to 'src/tuning/kernels/xgemm.cpp')
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index fa6b3085..7d0f3ed4 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -27,88 +27,111 @@ template <typename T, int V>
 class TuneXgemm {
  public:
 
-  // The representative kernel and the source code
-  static std::string KernelFamily() { return (V==1) ? "xgemm_1" : "xgemm_2"; }
-  static std::string KernelName() { return "Xgemm"; }
-  static std::string GetSources() {
-    return
-      #include "../src/kernels/common.opencl"
-      #include "../src/kernels/level3/xgemm_part1.opencl"
-      #include "../src/kernels/level3/xgemm_part2.opencl"
-      #include "../src/kernels/level3/xgemm_part3.opencl"
-    ;
+  // Settings for this kernel (default command-line arguments)
+  static TunerDefaults GetTunerDefaults() {
+    auto settings = TunerDefaults();
+    settings.options = {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction,
+                        kArgHeuristicSelection, kArgPsoSwarmSize,
+                        kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom};
+    settings.default_m = 1024;
+    settings.default_n = 1024;
+    settings.default_k = 1024;
+    settings.default_fraction = (V==1) ? 1.0 : 512.0; // test all or sample randomly
+    settings.default_num_runs = 2;
+    settings.default_heuristic = static_cast<size_t>(cltune::SearchMethod::RandomSearch);
+    return settings;
   }
 
-  // The list of arguments relevant for this routine
-  static std::vector<std::string> GetOptions() {
-    return {kArgM, kArgN, kArgK, kArgAlpha, kArgBeta, kArgFraction,
-            kArgHeuristicSelection, kArgPsoSwarmSize,
-            kArgPsoInfGlobal, kArgPsoInfLocal, kArgPsoInfRandom};
-  }
+  // Settings for this kernel (general)
+  static TunerSettings GetTunerSettings(const Arguments<T> &args) {
+    auto settings = TunerSettings();
+
+    // Identification of the kernel
+    settings.kernel_family = (V==1) ? "xgemm_1" : "xgemm_2";
+    settings.kernel_name = "Xgemm";
+    settings.sources =
+#include "../src/kernels/common.opencl"
+#include "../src/kernels/level3/xgemm_part1.opencl"
+#include "../src/kernels/level3/xgemm_part2.opencl"
+#include "../src/kernels/level3/xgemm_part3.opencl"
+    ;
 
-  // Tests for valid arguments
-  static void TestValidArguments(const Arguments<T> &) { }
+    // Buffer sizes
+    settings.size_a = args.m * args.k;
+    settings.size_b = args.n * args.k;
+    settings.size_c = args.m * args.n;
+
+    // Sets the base thread configuration
+    settings.global_size = {args.m, args.n};
+    settings.global_size_ref = settings.global_size;
+    settings.local_size = {1, 1};
+    settings.local_size_ref = {8, 8};
 
-  // Sets the default values for the arguments
-  static size_t DefaultM() { return 1024; }
-  static size_t DefaultN() { return 1024; }
-  static size_t DefaultK() { return 1024; }
-  static size_t DefaultBatchCount() { return 1; } // N/A for this kernel
-  static double DefaultFraction() { return (V==1) ? 1.0 : 512.0; } // test all or sample randomly
-  static size_t DefaultNumRuns() { return 2; } // run every kernel this many times for averaging
-  static size_t DefaultSwarmSizePSO() { return 8; } 
-  static double DefaultInfluenceGlobalPSO(){ return 0.1; }
-  static double DefaultInfluenceLocalPSO(){ return 0.3; }
-  static double DefaultInfluenceRandomPSO(){ return 0.6; }
-  static size_t DefaultHeuristic(){ return static_cast<size_t>(cltune::SearchMethod::RandomSearch); }
-  static double DefaultMaxTempAnn(){ return 1.0;}
-  
-  // Describes how to obtain the sizes of the buffers
-  static size_t GetSizeX(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeY(const Arguments<T> &) { return 1; } // N/A for this kernel
-  static size_t GetSizeA(const Arguments<T> &args) { return args.m * args.k; }
-  static size_t GetSizeB(const Arguments<T> &args) { return args.n * args.k; }
-  static size_t GetSizeC(const Arguments<T> &args) { return args.m * args.n; }
-  static size_t GetSizeTemp(const Arguments<T> &) { return 1; } // N/A for this kernel
-
-  // Sets the tuning parameters and their possible values
-  static void SetParameters(cltune::Tuner &tuner, const size_t id) {
+    // Transforms the thread configuration based on the parameters
+    settings.mul_local = {{"MDIMC", "NDIMC"}};
+    settings.mul_global = {{"MDIMC", "NDIMC"}};
+    settings.div_global = {{"MWG", "NWG"}};
+
+    // Sets the tuning parameters and their possible values
     if (V==1) { // limited subset of tuning parameters - but explorable exhaustively
-      tuner.AddParameter(id, "MWG", {16, 32, 64});
-      tuner.AddParameter(id, "NWG", {16, 32, 64});
-      tuner.AddParameter(id, "KWG", {32});
-      tuner.AddParameter(id, "MDIMC", {8, 16, 32});
-      tuner.AddParameter(id, "NDIMC", {8, 16, 32});
-      tuner.AddParameter(id, "MDIMA", {8, 16, 32});
-      tuner.AddParameter(id, "NDIMB", {8, 16, 32});
-      tuner.AddParameter(id, "KWI", {2});
-      tuner.AddParameter(id, "VWM", {1, 2, 4});
-      tuner.AddParameter(id, "VWN", {1, 2, 4});
-      tuner.AddParameter(id, "STRM", {0});
-      tuner.AddParameter(id, "STRN", {0});
-      tuner.AddParameter(id, "SA", {0, 1});
-      tuner.AddParameter(id, "SB", {0, 1});
-    } // a lot more tuning parameters - has to be sampled randomly, too much to test all
+      settings.parameters = {
+        {"MWG", {16, 32, 64}},
+        {"NWG", {16, 32, 64}},
+        {"KWG", {32}},
+        {"MDIMC", {8, 16, 32}},
+        {"NDIMC", {8, 16, 32}},
+        {"MDIMA", {8, 16, 32}},
+        {"NDIMB", {8, 16, 32}},
+        {"KWI", {2}},
+        {"VWM", {1, 2, 4}},
+        {"VWN", {1, 2, 4}},
+        {"STRM", {0}},
+        {"STRN", {0}},
+        {"SA", {0, 1}},
+        {"SB", {0, 1}},
+      };
+    }
+    else { // a lot more tuning parameters - has to be sampled randomly, too much to test all
+      settings.parameters = {
+        {"MWG", {16, 32, 64, 128}},
+        {"NWG", {16, 32, 64, 128}},
+        {"KWG", {16, 32}},
+        {"MDIMC", {8, 16, 32}},
+        {"NDIMC", {8, 16, 32}},
+        {"MDIMA", {8, 16, 32}},
+        {"NDIMB", {8, 16, 32}},
+        {"KWI", {2}},
+        {"VWM", {1, 2, 4, 8}},
+        {"VWN", {1, 2, 4, 8}},
+        {"STRM", {0, 1}},
+        {"STRN", {0, 1}},
+        {"SA", {0, 1}},
+        {"SB", {0, 1}},
+      };
+    }
+
+    // Describes how to compute the performance metrics
+    settings.metric_amount = 2 * args.m * args.n * args.k;
+    settings.performance_unit = "GFLOPS";
+
+    // Returns which search heuristic to use
+    if (V==1) { settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch); }
     else {
-      //RANDOM_SEARCH & PSO
-      tuner.AddParameter(id, "MWG", {16, 32, 64, 128});
-      tuner.AddParameter(id, "NWG", {16, 32, 64, 128});
-      tuner.AddParameter(id, "KWG", {16, 32});
-      tuner.AddParameter(id, "MDIMC", {8, 16, 32});
-      tuner.AddParameter(id, "NDIMC", {8, 16, 32});
-      tuner.AddParameter(id, "MDIMA", {8, 16, 32});
-      tuner.AddParameter(id, "NDIMB", {8, 16, 32});
-      tuner.AddParameter(id, "KWI", {2});
-      tuner.AddParameter(id, "VWM", {1, 2, 4, 8});
-      tuner.AddParameter(id, "VWN", {1, 2, 4, 8});
-      tuner.AddParameter(id, "STRM", {0, 1});
-      tuner.AddParameter(id, "STRN", {0, 1});
-      tuner.AddParameter(id, "SA", {0, 1});
-      tuner.AddParameter(id, "SB", {0, 1});
+      // Use full-search to explore all parameter combinations or another strategy to search only a
+      // part of the parameter values. The fraction is set as a command-line argument.
+      if (args.fraction == 1.0 || args.fraction == 0.0) {
+        settings.heuristic = static_cast<size_t>(cltune::SearchMethod::FullSearch);
+      } else {
+        settings.heuristic = args.heuristic_selection;
+      }
     }
 
+    return settings;
   }
 
+  // Tests for valid arguments
+  static void TestValidArguments(const Arguments<T> &) { }
+
   // Sets the constraints
   static void SetConstraints(cltune::Tuner &tuner, const size_t id) {
     auto MultipleOfX = [] (std::vector<size_t> v) { return IsMultiple(v[0], v[1]); };
@@ -144,19 +167,6 @@ class TuneXgemm {
                                                     "SB", "KWG", "NWG"});
   }
 
-  // Sets the base thread configuration
-  static std::vector<size_t> GlobalSize(const Arguments<T> &args) { return {args.m, args.n}; }
-  static std::vector<size_t> GlobalSizeRef(const Arguments<T> &args) { return GlobalSize(args); }
-  static std::vector<size_t> LocalSize() { return {1, 1}; }
-  static std::vector<size_t> LocalSizeRef() { return {8, 8}; }
-
-  // Transforms the thread configuration based on the parameters
-  using TransformVector = std::vector<std::vector<std::string>>;
-  static TransformVector MulLocal() { return {{"MDIMC", "NDIMC"}}; }
-  static TransformVector DivLocal() { return {}; }
-  static TransformVector MulGlobal() { return {{"MDIMC", "NDIMC"}}; }
-  static TransformVector DivGlobal() { return {{"MWG", "NWG"}}; }
-
   // Sets the kernel's arguments
   static void SetArguments(cltune::Tuner &tuner, const Arguments<T> &args,
                            std::vector<T> &, std::vector<T> &,
@@ -171,26 +181,6 @@ class TuneXgemm {
     tuner.AddArgumentInput(b_mat);
     tuner.AddArgumentOutput(c_mat);
   }
-
-  // Describes how to compute the performance metrics
-  static size_t GetMetric(const Arguments<T> &args) {
-    return 2 * args.m * args.n * args.k;
-  }
-  static std::string PerformanceUnit() { return "GFLOPS"; }
- 
-  // Returns which Heuristic to run 
-  static size_t GetHeuristic(const Arguments<T> &args){
-    if (V==1) { return static_cast<size_t>(cltune::SearchMethod::FullSearch); }
-    else {
-      // Use full-search to explore all parameter combinations or another strategy to search only a
-      // part of the parameter values. The fraction is set as a command-line argument.
-      if (args.fraction == 1.0 || args.fraction == 0.0) {
-        return static_cast<size_t>(cltune::SearchMethod::FullSearch);
-      } else {
-        return args.heuristic_selection;
-      }
-    }
-  } 
 };
 
 // =================================================================================================
-- 
cgit v1.2.3


From 375193fe4e72b320eb63fbc6f98c24714f6970c2 Mon Sep 17 00:00:00 2001
From: Cedric Nugteren <web@cedricnugteren.nl>
Date: Tue, 3 Oct 2017 21:55:21 +0200
Subject: GEMM indirect implementation now uses 1 larger temporary buffer
 instead of up to 3 optional temporary buffers

---
 CHANGELOG                             |  1 +
 src/kernels/level3/xgemm_part3.opencl |  9 +++++++--
 src/routines/level3/xgemm.cpp         | 30 +++++++++++++++++++++++-------
 src/tuning/kernels/xgemm.cpp          |  2 ++
 4 files changed, 33 insertions(+), 9 deletions(-)

(limited to 'src/tuning/kernels/xgemm.cpp')

diff --git a/CHANGELOG b/CHANGELOG
index 62f356a1..bb2013a6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,7 @@
 
 Development (next version)
 - Kernels are now cached based on their tuning parameters: fits the use-case of 'OverrideParameters'
+- Improved performance for small GEMM problems by going from 3 to 1 optional temporary buffers
 - Various minor fixes and enhancements
 
 Version 1.1.0
diff --git a/src/kernels/level3/xgemm_part3.opencl b/src/kernels/level3/xgemm_part3.opencl
index 3f0d590d..f447677f 100644
--- a/src/kernels/level3/xgemm_part3.opencl
+++ b/src/kernels/level3/xgemm_part3.opencl
@@ -17,7 +17,7 @@ R"(
 
 // =================================================================================================
 
-// Main body of the matrix-multiplication algorithm. It calls the (inlined) functions above.
+// Main body of the matrix-multiplication algorithm. It calls various (inlined) functions.
 INLINE_FUNC void XgemmBody(const int kSizeM, const int kSizeN, const int kSizeK,
                            const __global realM* restrict agm, const __global realN* restrict bgm,
                            __global realM* cgm, realM cpm[NWI][MWI/VWM]
@@ -192,10 +192,15 @@ void Xgemm(const int kSizeM, const int kSizeN, const int kSizeK,
            const real_arg arg_beta,
            const __global realM* restrict agm,
            const __global realN* restrict bgm,
-           __global realM* cgm) {
+           __global realM* cgm,
+           const int b_offset, const int c_offset) {
   const real alpha = GetRealArg(arg_alpha);
   const real beta = GetRealArg(arg_beta);
 
+  // Adds the offsets (in case of use of a single temporary buffer for A, B, and C)
+  bgm = &bgm[b_offset];
+  cgm = &cgm[c_offset];
+
   // Allocates workgroup-private memory (local memory)
   #if SA == 1
     __local realM alm[KWG * MWG/VWM];
diff --git a/src/routines/level3/xgemm.cpp b/src/routines/level3/xgemm.cpp
index 3909c308..253976e1 100644
--- a/src/routines/level3/xgemm.cpp
+++ b/src/routines/level3/xgemm.cpp
@@ -161,10 +161,24 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
   auto c_no_temp = c_one == c_one_i && c_two == c_two_i && c_ld == c_one && c_offset == 0 &&
                    c_do_transpose == false;
 
-  // Creates the temporary matrices
-  const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, a_one_i*a_two_i);
-  const auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, b_one_i*b_two_i);
-  const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, c_one_i*c_two_i);
+  // Computes the sizes and offsets for (optional) temporary buffers for the 3 matrices
+  auto temp_size = size_t{0};
+  auto b_temp_offset = size_t{0};
+  auto c_temp_offset = size_t{0};
+  if (!a_no_temp) { temp_size += a_one_i*a_two_i; }
+  if (!b_no_temp) { b_temp_offset = temp_size; temp_size += b_one_i*b_two_i; }
+  if (!c_no_temp) { c_temp_offset = temp_size; temp_size += c_one_i*c_two_i; }
+  if (!IsMultiple(b_temp_offset, db_["VWN"])) { throw BLASError(StatusCode::kUnexpectedError); }
+  if (!IsMultiple(c_temp_offset, db_["VWM"])) { throw BLASError(StatusCode::kUnexpectedError); }
+
+  // Creates the buffer for the (optional) temporary matrices. Note that we use 'a_buffer' in case
+  // when no temporary buffer is needed, but that's just to make it compile: it is never used.
+  const auto temp_buffer = (temp_size > 0) ? Buffer<T>(context_, temp_size) : a_buffer;
+
+  // Sets the buffer pointers for (temp) matrices A, B, and C
+  const auto a_temp = (a_no_temp) ? a_buffer : temp_buffer;
+  const auto b_temp = (b_no_temp) ? b_buffer : temp_buffer;
+  const auto c_temp = (c_no_temp) ? c_buffer : temp_buffer;
 
   // Events of all kernels (including pre/post processing kernels)
   auto eventWaitList = std::vector<Event>();
@@ -188,7 +202,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
     auto eventProcessB = Event();
     PadCopyTransposeMatrix(queue_, device_, db_, eventProcessB.pointer(), emptyEventList,
                            b_one, b_two, b_ld, b_offset, b_buffer,
-                           b_one_i, b_two_i, b_one_i, 0, b_temp,
+                           b_one_i, b_two_i, b_one_i, b_temp_offset, b_temp,
                            ConstantOne<T>(), program_,
                            true, b_do_transpose, b_conjugate);
     eventWaitList.push_back(eventProcessB);
@@ -199,7 +213,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
     auto eventProcessC = Event();
     PadCopyTransposeMatrix(queue_, device_, db_, eventProcessC.pointer(), emptyEventList,
                            c_one, c_two, c_ld, c_offset, c_buffer,
-                           c_one_i, c_two_i, c_one_i, 0, c_temp,
+                           c_one_i, c_two_i, c_one_i, c_temp_offset, c_temp,
                            ConstantOne<T>(), program_,
                            true, c_do_transpose, false);
     eventWaitList.push_back(eventProcessC);
@@ -217,6 +231,8 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
   kernel.SetArgument(5, a_temp());
   kernel.SetArgument(6, b_temp());
   kernel.SetArgument(7, c_temp());
+  kernel.SetArgument(8, static_cast<int>(b_temp_offset / db_["VWN"]));
+  kernel.SetArgument(9, static_cast<int>(c_temp_offset / db_["VWM"]));
 
   // Computes the global and local thread sizes
   const auto global = std::vector<size_t>{
@@ -234,7 +250,7 @@ void Xgemm<T>::GemmIndirect(const size_t m, const size_t n, const size_t k,
   if (!c_no_temp) {
     eventWaitList.push_back(eventKernel);
     PadCopyTransposeMatrix(queue_, device_, db_, event_, eventWaitList,
-                           c_one_i, c_two_i, c_one_i, 0, c_temp,
+                           c_one_i, c_two_i, c_one_i, c_temp_offset, c_temp,
                            c_one, c_two, c_ld, c_offset, c_buffer,
                            ConstantOne<T>(), program_,
                            false, c_do_transpose, false);
diff --git a/src/tuning/kernels/xgemm.cpp b/src/tuning/kernels/xgemm.cpp
index 7d0f3ed4..6dcdf68b 100644
--- a/src/tuning/kernels/xgemm.cpp
+++ b/src/tuning/kernels/xgemm.cpp
@@ -180,6 +180,8 @@ class TuneXgemm {
     tuner.AddArgumentInput(a_mat);
     tuner.AddArgumentInput(b_mat);
     tuner.AddArgumentOutput(c_mat);
+    tuner.AddArgumentScalar(0);
+    tuner.AddArgumentScalar(0);
   }
 };
 
-- 
cgit v1.2.3