Merge branch 'master' into CLBlast-267-convgemm

author: Cedric Nugteren <web@cedricnugteren.nl> 2018-05-19 17:54:27 +0200
committer: Cedric Nugteren <web@cedricnugteren.nl> 2018-05-19 17:54:27 +0200
commit: cbcd4ff7e8e21584a9a1f405c9f4cb979a73b718 (patch)
tree: 4a131ed480dc4f496a211453f95adfebaf3f6336
parent: e057a9186a1ed0a169fcf4db7a2598d08f530834 (diff)
parent: 507d7bc729eff888dd499e937bf1a636cbdee75b (diff)
25 files changed, 169 insertions, 93 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 5f3ef371..c86ab70b 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,8 +6,11 @@ Development (next version)
 - Improved the performance potential by adding a second tunable GEMM kernel with 2D register tiling
 - Added support for Intel specific subgroup shuffling extensions for faster GEMM on Intel GPUs
 - Re-added a local memory size constraint to the tuners
+- The routine tuners now automatically pick up tuning results from disk from the kernel tuners
 - Updated and reorganised the CLBlast documentation
+- Added a 'canary' region to check for overflows in the tuner and tests (inspired by clARMOR)
 - Fixed an access violation when compiled with Visual Studio upon releasing the OpenCL program
+- Fixed incorrect releasing of the OpenCL program resulting in segfaults / access violations
 - Various minor fixes and enhancements
 - Added tuned parameters for various devices (see doc/tuning.md)
 - Added non-BLAS level-1 routines:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4974545e..72aaa533 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -424,9 +424,9 @@ if(TUNERS)
     install(TARGETS clblast_tuner_${KERNEL} DESTINATION bin)
   endforeach()
   foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
-    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp)
+    add_executable(clblast_tuner_routine_${ROUTINE_TUNER} ${TUNERS_COMMON} src/tuning/routines/${ROUTINE_TUNER}.cpp test/test_utilities.cpp)
     target_link_libraries(clblast_tuner_routine_${ROUTINE_TUNER} clblast)
-    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS})
+    target_include_directories(clblast_tuner_routine_${ROUTINE_TUNER} PUBLIC $<TARGET_PROPERTY:clblast,INTERFACE_INCLUDE_DIRECTORIES> ${API_INCLUDE_DIRS} ${clblast_SOURCE_DIR})
     install(TARGETS clblast_tuner_routine_${ROUTINE_TUNER} DESTINATION bin)
   endforeach()
 
@@ -439,6 +439,12 @@ if(TUNERS)
     endforeach()
     set(ALLTUNERSDEPENDS clblast_tuner_${KERNEL})
   endforeach()
+  foreach(ROUTINE_TUNER ${ROUTINE_TUNERS})
+    foreach(PRECISION ${PRECISIONS})
+      set(ALLTUNERS ${ALLTUNERS} COMMAND clblast_tuner_routine_${ROUTINE_TUNER} -precision ${PRECISION})
+    endforeach()
+    set(ALLTUNERSDEPENDS clblast_tuner_routine_${ROUTINE_TUNER})
+  endforeach()
   add_custom_target(alltuners ${ALLTUNERS} DEPENDS ${ALLTUNERSDEPENDS})
 
 endif()
diff --git a/README.md b/README.md
index e4555359..0070a84c 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@ CLBlast: The tuned OpenCL BLAS library
 
 | | Build status | Tests on Intel CPU | Tests on NVIDIA GPU | Tests on Intel GPU |
 |-----|-----|-----|-----|-----|
-| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://67.207.87.39:8010/#/builders/106) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-nvidia-k5000.svg)](http://67.207.87.39:8010/#/builders/105) | [![Build Status](http://67.207.87.39:8010/badges/clblast-windows-intel-HD4600.svg)](http://67.207.87.39:8010/#/builders/107) |
-| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://67.207.87.39:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://67.207.87.39:8010/#/builders/97) | [![Build Status](http://67.207.87.39:8010/badges/clblast-linux-nvidia-k80.svg)](http://67.207.87.39:8010/#/builders/98) | N/A |
-| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) |  [![Build Status](http://67.207.87.39:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://67.207.87.39:8010/#/builders/110) | N/A | N/A |
+| Windows | [![Build Status](https://ci.appveyor.com/api/projects/status/github/cnugteren/clblast?branch=master&svg=true)](https://ci.appveyor.com/project/CNugteren/clblast) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-i7-4790k.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-i7-4790k) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-nvidia-k5000.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-nvidia-k5000) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-windows-intel-HD4600.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-windows-intel-HD4600) |
+| Linux | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-intel-e5-2620-v4.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-intel-e5-2620-v4) | [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-linux-nvidia-k80.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-linux-nvidia-k80) | N/A |
+| OS X | [![Build Status](https://travis-ci.org/CNugteren/CLBlast.svg?branch=master)](https://travis-ci.org/CNugteren/CLBlast/branches) |  [![Build Status](http://ci.arrayfire.org:8010/badges/clblast-osx-intel-i5-4278U.svg)](http://ci.arrayfire.org:8010/#/builders/clblast-osx-intel-i5-4278U) | N/A | N/A |
 
 CLBlast is a modern, lightweight, performant and tunable OpenCL BLAS library written in C++11. It is designed to leverage the full performance potential of a wide variety of OpenCL devices from different vendors, including desktop and laptop GPUs, embedded GPUs, and other accelerators. CLBlast implements BLAS routines: basic linear algebra subprograms operating on vectors and matrices. See [the CLBlast website](https://cnugteren.github.io/clblast) for performance reports on various devices as well as the latest CLBlast news.
 
@@ -78,6 +78,7 @@ More detailed documentation is available in separate files:
 * [Tuning for better performance](doc/tuning.md)
 * [Testing the library for correctness](doc/testing.md)
 * [Bindings / wrappers for other languages](doc/bindings.md)
+* [More details on the GEMM kernel](doc/details_gemm.md)
 * [Glossary with some terms explained](doc/glossary.md)
 
 
@@ -133,6 +134,7 @@ Tuning and testing on a variety of OpenCL devices was made possible by:
 
 Hardware/software for this project was contributed by:
 
+* [HPC research group at the University of Bristol](http://uob-hpc.github.io/zoo/) for access to their GPU zoo
 * [ArrayFire](http://arrayfire.org) for settings up and supporting Buildbot correctness tests on multiple platforms
 * [JetBrains](https://www.jetbrains.com/clion/) for supply a free CLion IDE license for CLBlast developers
 * [Travis CI](https://travis-ci.org/CNugteren/CLBlast/branches) and [AppVeyor](https://ci.appveyor.com/project/CNugteren/clblast) for free automated build tests for open-source projects
@@ -143,8 +145,8 @@ More information
 
 Further information on CLBlast is available through the following links:
 
-* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf).
-* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (May 2017, updated April 2018). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
+* A 20-minute presentation of CLBlast was given at the GPU Technology Conference in May 2017. A recording is available on the [GTC on-demand website](http://on-demand.gputechconf.com/gtc/2017/video/s7280-nugteren-clblast.mp4) (poor audio quality however) and a full slide-set is also available [as PDF](http://on-demand.gputechconf.com/gtc/2017/presentation/s7280-cedric-nugteren-clblast.pdf). An updated version was also presented at IWOCL in May 2018. The slide set can be found [here as PDF](https://cnugteren.github.io/downloads/CLBlastIWOCL18.pdf).
+* More in-depth information and experimental results are also available in a scientific paper titled [CLBlast: A Tuned OpenCL BLAS Library](https://arxiv.org/abs/1705.05249) (v1 May 2017, updated to v2 in April 2018). For CLTune, the inspiration for the included auto-tuner, see also the [CLTune: A Generic Auto-Tuner for OpenCL Kernels](https://arxiv.org/abs/1703.06503) paper.
 
 How to cite this work:
 
diff --git a/ROADMAP.md b/ROADMAP.md
index 3be62501..9b870523 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -18,7 +18,8 @@ This file gives an overview of the main features planned for addition to CLBlast
 | [#223](https://github.com/CNugteren/CLBlast/issues/223)        | Feb '18     | CNugteren | ✔      | Python OpenCL interface |
 | [#237](https://github.com/CNugteren/CLBlast/issues/237)        | Mar '18     | CNugteren | ✔      | Making tuning possible from the CLBlast API |
 | [#228](https://github.com/CNugteren/CLBlast/issues/228)        | Mar-Apr '18 | CNugteren | ✔      | Improving performance for Qualcomm Adreno GPUs |
-| [#270](https://github.com/CNugteren/CLBlast/issues/270)        | May '18     | CNugteren |        | Implement col2im |
 | [#267](https://github.com/CNugteren/CLBlast/issues/267)        | May '18     | CNugteren |        | Merge im2col and GEMM into a direct kernel |
+| [#270](https://github.com/CNugteren/CLBlast/issues/270)        | July '18    | CNugteren |        | Implement col2im |
+| -                                                              | July '18    | CNugteren |        | Add a SYCL interface to the library |
 | [#136](https://github.com/CNugteren/CLBlast/issues/136)        | ??          | CNugteren |        | Implement xAXPBY and xSET |
 | [#169](https://github.com/CNugteren/CLBlast/issues/169)        | ??          | dividiti  |        | Problem-specific tuning parameter selection |
diff --git a/doc/details_gemm.md b/doc/details_gemm.md
new file mode 100644
index 00000000..d4666abb
--- /dev/null
+++ b/doc/details_gemm.md
@@ -0,0 +1,27 @@
+CLBlast: Details on the GEMM routine and kernel
+================
+
+This document gives a bit more detail on how the GEMM routine is organised and implemented. For other information about CLBlast, see the [main README](../README.md).
+
+
+GEMM: Two approaches
+-------------
+
+CLBlast implements two approaches to GEMM, direct and indirect:
+
+* Direct GEMM: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
+* Indirect GEMM: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.
+
+
+GEMM: In-direct approach
+-------------
+
+Similar to the work by Matsumoto et al. ("Performance Tuning of Matrix Multiplication in OpenCL on Different GPUs and CPUs"), the main GEMM kernel makes many assumptions on the input arguments, which are handled by pre-processing and post-processing kernels. These assumptions are e.g. matrix sizes are a multiple of the work-group sizes, offsets are zero, and matrix B is transposed. This is a good solution for larger problem sizes since O(n^2) data movement is typically cheaper than O(n^3) computation, but the hidden constant starts to play a role for smaller n. Therefore, there is also a single-kernel direct version available for those cases, but it shares most of the design and parameters as discussed below.
+
+The main kernel has 14 different parameters, of which some are illustrated in figure 1 in the [CLBlast paper](https://arxiv.org/pdf/1705.05249). The parameters define among others the work-group sizes in 2 dimensions (MWG, NWG), the 2D register tiling configuration (MWI, NWI), the vector widths of both input matrices (VWM, VWN), loop unroll factors (KWI), and whether or not and how to use the local memory.
+
+
+GEMM: Direct approach
+-------------
+
+This is a single-kernel approach that shares many of its parameters with the in-direct kernel. One of the differences is that within the kernel there are checks for incomplete tiles in the m/n/k dimensions, influenced by the tuning parameters and the matrix sizes. These incomplete tiles will run a different part of the code, as they for example cannot benefit from vectorisation. Another difference is that there are dedicated kernels for each a/b transpose requirement: NN, NT, TN, and TT (where N stands for non-transposed and T for transposed).
+\ No newline at end of file
diff --git a/doc/tuning.md b/doc/tuning.md
index 60ad2cc7..b5186ac6 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -82,7 +82,7 @@ Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clbla
 
 The kernels `gemm` and `gemm_direct` have too many parameters to explore. Therefore, they will run in two stages: a first stage with a fixed limited number of parameter combinations, and a second stage with a random selection from a much larger search space. The random fraction is determined by the `fraction` argument on the command-line.
 
-There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
+There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the in-direct GEMM kernel.
 
 
 Using the tuning results
@@ -100,8 +100,6 @@ In summary, tuning the entire library for your device can be done as follows (st
     python ../scripts/database/database.py . ..
     make
 
-After the kernels are tuned, you can run the `clblast_tuner_routine_xgemm` tuner to optimize the high-level GEMM routine, i.e. selecting which method to use: the direct kernel or the in-direct kernel.
-
 
 Tuning using the API (advanced users only)
 -------------
diff --git a/src/cache.cpp b/src/cache.cpp
index 4b74b0a1..e15a72a5 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -117,8 +117,8 @@ template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
 
 // =================================================================================================
 
-template class Cache<ProgramKey, Program>;
-template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
+template class Cache<ProgramKey, std::shared_ptr<Program>>;
+template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
 template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name
 
 // =================================================================================================
diff --git a/src/cache.hpp b/src/cache.hpp
index 228fbccb..89973f61 100644
--- a/src/cache.hpp
+++ b/src/cache.hpp
@@ -83,10 +83,10 @@ extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const
 typedef std::tuple<RawContext, RawDeviceID, Precision, std::string> ProgramKey;
 typedef std::tuple<const RawContext &, const RawDeviceID &, const Precision &, const std::string &> ProgramKeyRef;
 
-typedef Cache<ProgramKey, Program> ProgramCache;
+typedef Cache<ProgramKey, std::shared_ptr<Program>> ProgramCache;
 
-extern template class Cache<ProgramKey, Program>;
-extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
+extern template class Cache<ProgramKey, std::shared_ptr<Program>>;
+extern template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
 
 // =================================================================================================
 
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index c4b721b9..ce6f39cb 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -437,47 +437,41 @@ using ContextPointer = cl_context*;
 // C++11 version of 'cl_program'.
 class Program {
  public:
-  Program() = default;
 
   // Source-based constructor with memory management
-  explicit Program(const Context &context, const std::string &source):
-      program_(new cl_program, [](cl_program* p) {
-        #ifndef _MSC_VER // 'clReleaseProgram' caused an access violation with Visual Studio
-          if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
-        #endif
-        delete p;
-      }) {
+  explicit Program(const Context &context, const std::string &source) {
     const char *source_ptr = &source[0];
     const auto length = source.length();
     auto status = CL_SUCCESS;
-    *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
+    program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
     CLCudaAPIError::Check(status, "clCreateProgramWithSource");
   }
 
   // Binary-based constructor with memory management
-  explicit Program(const Device &device, const Context &context, const std::string &binary):
-      program_(new cl_program, [](cl_program* p) {
-        if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
-        delete p;
-      }) {
+  explicit Program(const Device &device, const Context &context, const std::string &binary) {
     const char *binary_ptr = &binary[0];
     const auto length = binary.length();
     auto status1 = CL_SUCCESS;
     auto status2 = CL_SUCCESS;
     const auto dev = device();
-    *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
+    program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
                                           reinterpret_cast<const unsigned char**>(&binary_ptr),
                                           &status1, &status2);
     CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)");
     CLCudaAPIError::Check(status2, "clCreateProgramWithBinary");
   }
 
+  // Clean-up
+  ~Program() {
+    if (program_) { CheckErrorDtor(clReleaseProgram(program_)); }
+  }
+
   // Compiles the device program and checks whether or not there are any warnings/errors
   void Build(const Device &device, std::vector<std::string> &options) {
     options.push_back("-cl-std=CL1.1");
     auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
     const cl_device_id dev = device();
-    CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
+    CheckError(clBuildProgram(program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
   }
 
   // Confirms whether a certain status code is an actual compilation error or warning
@@ -489,28 +483,28 @@ class Program {
   std::string GetBuildInfo(const Device &device) const {
     auto bytes = size_t{0};
     auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
-    CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
+    CheckError(clGetProgramBuildInfo(program_, device(), query, 0, nullptr, &bytes));
     auto result = std::string{};
     result.resize(bytes);
-    CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
+    CheckError(clGetProgramBuildInfo(program_, device(), query, bytes, &result[0], nullptr));
     return result;
   }
 
   // Retrieves a binary or an intermediate representation of the compiled program
   std::string GetIR() const {
     auto bytes = size_t{0};
-    CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
+    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
     auto result = std::string{};
     result.resize(bytes);
     auto result_ptr = result.data();
-    CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
+    CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
     return result;
   }
 
   // Accessor to the private data-member
-  const cl_program& operator()() const { return *program_; }
+  const cl_program& operator()() const { return program_; }
  private:
-  std::shared_ptr<cl_program> program_;
+  cl_program program_ = nullptr;
 };
 
 // =================================================================================================
@@ -757,13 +751,13 @@ class Kernel {
   }
 
   // Regular constructor with memory management
-  explicit Kernel(const Program &program, const std::string &name):
+  explicit Kernel(const std::shared_ptr<Program> program, const std::string &name):
       kernel_(new cl_kernel, [](cl_kernel* k) {
         if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
         delete k;
       }) {
     auto status = CL_SUCCESS;
-    *kernel_ = clCreateKernel(program(), name.c_str(), &status);
+    *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
     CLCudaAPIError::Check(status, "clCreateKernel");
   }
 
diff --git a/src/routine.cpp b/src/routine.cpp
index fa5934f6..4caa4d7b 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -96,10 +96,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
   auto binary = BinaryCache::Instance().Get(BinaryKeyRef{platform_id,  precision_, routine_info, device_name },
                                             &has_binary);
   if (has_binary) {
-    program_ = Program(device_, context_, binary);
-    program_.Build(device_, options);
+    program_ = std::make_shared<Program>(Program(device_, context_, binary));
+    program_->Build(device_, options);
     ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_info },
-                                   Program{ program_ });
+                                    std::shared_ptr<Program>{program_});
     return;
   }
 
@@ -135,10 +135,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
 
   // Store the compiled binary and program in the cache
   BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
-                                program_.GetIR());
+                                program_->GetIR());
 
   ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
-                                 Program{ program_ });
+                                 std::shared_ptr<Program>{program_});
 }
 
 // =================================================================================================
diff --git a/src/routine.hpp b/src/routine.hpp
index 00f7b5cc..8db5e5a9 100644
--- a/src/routine.hpp
+++ b/src/routine.hpp
@@ -33,6 +33,7 @@ namespace clblast {
 class Routine {
  public:
 
+  // Initializes db_, fetching cached database or building one
   static void InitDatabase(const Device &device, const std::vector<std::string> &kernel_names,
                            const Precision precision, const std::vector<database::DatabaseEntry> &userDatabase,
                            Databases &db) {
@@ -78,9 +79,6 @@ class Routine {
   // Initializes program_, fetching cached program or building one
   void InitProgram(std::initializer_list<const char *> source);
 
-  // Initializes db_, fetching cached database or building one
-  void InitDatabase(const std::vector<database::DatabaseEntry> &userDatabase);
-
  protected:
 
   // Non-static variable for the precision
@@ -97,7 +95,7 @@ class Routine {
   const Device device_;
 
   // Compiled program (either retrieved from cache or compiled in slow path)
-  Program program_;
+  std::shared_ptr<Program> program_;
 
   // Connection to the database for all the device-specific parameters
   Databases db_;
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index a4d1f577..5b80e3f2 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -77,7 +77,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
 // Sets all elements of a matrix to a constant value
 template <typename T>
 void FillMatrix(Queue &queue, const Device &device,
-                const Program &program, const Databases &,
+                const std::shared_ptr<Program> program, const Databases &,
                 EventPointer event, const std::vector<Event> &waitForEvents,
                 const size_t m, const size_t n, const size_t ld, const size_t offset,
                 const Buffer<T> &dest,
@@ -95,26 +95,26 @@ void FillMatrix(Queue &queue, const Device &device,
 }
 
 // Compiles the above function
-template void FillMatrix<half>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                const size_t, const size_t, const Buffer<half>&, const half);
-template void FillMatrix<float>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                 EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                 const size_t, const size_t, const Buffer<float>&, const float);
-template void FillMatrix<double>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                  EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                  const size_t, const size_t, const Buffer<double>&, const double);
-template void FillMatrix<float2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                  EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                  const size_t, const size_t, const Buffer<float2>&, const float2);
-template void FillMatrix<double2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                   EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                   const size_t, const size_t, const Buffer<double2>&, const double2);
 
 // Sets all elements of a vector to a constant value
 template <typename T>
 void FillVector(Queue &queue, const Device &device,
-                const Program &program, const Databases &,
+                const std::shared_ptr<Program> program, const Databases &,
                 EventPointer event, const std::vector<Event> &waitForEvents,
                 const size_t n, const size_t inc, const size_t offset,
                 const Buffer<T> &dest,
@@ -131,19 +131,19 @@ void FillVector(Queue &queue, const Device &device,
 }
 
 // Compiles the above function
-template void FillVector<half>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                const size_t, const Buffer<half>&, const half);
-template void FillVector<float>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                 EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                 const size_t, const Buffer<float>&, const float);
-template void FillVector<double>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                  EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                  const size_t, const Buffer<double>&, const double);
-template void FillVector<float2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                  EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                  const size_t, const Buffer<float2>&, const float2);
-template void FillVector<double2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
                                   EventPointer, const std::vector<Event>&, const size_t, const size_t,
                                   const size_t, const Buffer<double2>&, const double2);
 
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 6cbe1e1b..b909243d 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -36,7 +36,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
 // Sets all elements of a matrix to a constant value
 template <typename T>
 void FillMatrix(Queue &queue, const Device &device,
-                const Program &program, const Databases &,
+                const std::shared_ptr<Program> program, const Databases &,
                 EventPointer event, const std::vector<Event> &waitForEvents,
                 const size_t m, const size_t n, const size_t ld, const size_t offset,
                 const Buffer<T> &dest,
@@ -45,7 +45,7 @@ void FillMatrix(Queue &queue, const Device &device,
 // Sets all elements of a vector to a constant value
 template <typename T>
 void FillVector(Queue &queue, const Device &device,
-                const Program &program, const Databases &,
+                const std::shared_ptr<Program> program, const Databases &,
                 EventPointer event, const std::vector<Event> &waitForEvents,
                 const size_t n, const size_t inc, const size_t offset,
                 const Buffer<T> &dest,
@@ -66,7 +66,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
                             const size_t dest_ld, const size_t dest_offset,
                             const Buffer<T> &dest,
                             const T alpha,
-                            const Program &program, const bool do_pad,
+                            const std::shared_ptr<Program> program, const bool do_pad,
                             const bool do_transpose, const bool do_conjugate,
                             const bool upper = false, const bool lower = false,
                             const bool diagonal_imag_zero = false) {
@@ -186,7 +186,7 @@ void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device,
                                    const size_t dest_one, const size_t dest_two,
                                    const size_t dest_ld, const Buffer<int> &dest_offsets,
                                    const Buffer<T> &dest,
-                                   const Program &program, const bool do_pad,
+                                   const std::shared_ptr<Program> program, const bool do_pad,
                                    const bool do_transpose, const bool do_conjugate,
                                    const size_t batch_count) {
 
@@ -250,7 +250,7 @@ void PadCopyTransposeMatrixStridedBatched(Queue &queue, const Device &device,
                                           const size_t dest_one, const size_t dest_two,
                                           const size_t dest_ld, const size_t dest_offset,
                                           const size_t dest_stride, const Buffer<T> &dest,
-                                          const Program &program, const bool do_pad,
+                                          const std::shared_ptr<Program> program, const bool do_pad,
                                           const bool do_transpose, const bool do_conjugate,
                                           const size_t batch_count) {
 
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index 0721ad7c..92aab611 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -15,8 +15,10 @@
 #include <exception>
 #include <string>
 #include <vector>
+#include <iostream>
 
 #include "utilities/utilities.hpp"
+#include "test/test_utilities.hpp"
 #include "tuning/routines/routine_tuner.hpp"
 
 namespace clblast {
@@ -101,6 +103,22 @@ void TuneXgemm(int argc, char* argv[]) {
   const auto context = Context(device);
   auto queue = Queue(context, device);
 
+  // Pre-load GEMM kernel tuning results if they exist
+  printf("* The GEMM routine tuner requires already tuned kernels\n");
+  printf("  Applying tuning results from disk if they exist...\n\n");
+  const auto kernel_names = {"xgemm_1", "xgemm_direct_1", "copy", "pad", "transpose", "padtranspose"};
+  for (const auto& kernel_name : kernel_names) {
+    const auto tuner_file_name = "clblast_" + std::string{kernel_name} + "_" +
+                                 ToString(static_cast<int>(precision)) + ".json";
+    printf("* Looking for tuning results in the current folder: '%s'\n", tuner_file_name.c_str());
+    if (std::ifstream(tuner_file_name)) { // True only if the file could be opened for reading
+      OverrideParametersFromJSONFiles({tuner_file_name}, device(), precision);
+    }
+    else {
+      printf("  Not found: assuming the kernel '%s' is already tuned\n\n", kernel_name);
+    }
+  }
+
   // Run the tuners for the XGEMM routines
   TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
                          64, 2048, 64, 1, num_runs,
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index dd4a83e6..216f4b31 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -150,11 +150,11 @@ void Tuner(int argc, char* argv[], const int V,
   const auto device_architecture = GetDeviceArchitecture(device);
   const auto device_name = GetDeviceName(device);
 
-  // Creates input buffers with random data
+  // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
   const auto buffer_sizes = std::vector<size_t>{
-      settings.size_x, settings.size_y,
-      settings.size_a, settings.size_b, settings.size_c,
-      settings.size_temp
+      settings.size_x + kCanarySize, settings.size_y + kCanarySize,
+      settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize,
+      settings.size_temp + kCanarySize
   };
   std::mt19937 mt(kSeed);
   std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp
index f1da40c1..2eec2e2e 100644
--- a/src/tuning/tuning_api.cpp
+++ b/src/tuning/tuning_api.cpp
@@ -241,11 +241,11 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
   const auto device_architecture = GetDeviceArchitecture(device);
   const auto device_name = GetDeviceName(device);
 
-  // Creates input buffers with random data
+  // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
   const auto buffer_sizes = std::vector<size_t>{
-      settings.size_x, settings.size_y,
-      settings.size_a, settings.size_b, settings.size_c,
-      settings.size_temp
+      settings.size_x + kCanarySize, settings.size_y + kCanarySize,
+      settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize,
+      settings.size_temp + kCanarySize
   };
   const auto seed = static_cast<unsigned long>(time(nullptr));
   std::mt19937 mt(seed);
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 65131cca..05c29944 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -21,7 +21,8 @@ namespace clblast {
 // =================================================================================================
 
 // Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
+std::shared_ptr<Program> CompileFromSource(
+                          const std::string &source_string, const Precision precision,
                           const std::string &routine_name,
                           const Device& device, const Context& context,
                           std::vector<std::string>& options,
@@ -93,13 +94,13 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
   }
 
   // Compiles the kernel
-  auto program = Program(context, kernel_string);
+  auto program = std::make_shared<Program>(context, kernel_string);
   try {
-    program.Build(device, options);
+    program->Build(device, options);
   } catch (const CLCudaAPIBuildError &e) {
-    if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
+    if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) {
       fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
-              program.GetBuildInfo(device).c_str());
+              program->GetBuildInfo(device).c_str());
     }
     throw;
   }
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
index 1b4f4a7a..13e8c363 100644
--- a/src/utilities/compile.hpp
+++ b/src/utilities/compile.hpp
@@ -24,7 +24,8 @@ namespace clblast {
 // =================================================================================================
 
 // Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
+std::shared_ptr<Program> CompileFromSource(
+                          const std::string &source_string, const Precision precision,
                           const std::string &routine_name,
                           const Device& device, const Context& context,
                           std::vector<std::string>& options,
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 2d2cd62e..a29e531a 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -52,6 +52,9 @@ const std::string kKhronosIntelSubgroups = "cl_intel_subgroups";
 // Catched an unknown error
 constexpr auto kUnknownError = -999;
 
+// Canary size to add to buffers to check for buffer overflows
+constexpr auto kCanarySize = 127;
+
 // =================================================================================================
 
 // The routine-specific arguments in string form
diff --git a/test/correctness/testblas.cpp b/test/correctness/testblas.cpp
index aa4b4785..3c92565e 100644
--- a/test/correctness/testblas.cpp
+++ b/test/correctness/testblas.cpp
@@ -66,14 +66,14 @@ TestBlas<T,U>::TestBlas(const std::vector<std::string> &arguments, const bool si
   const auto max_offset = *std::max_element(kOffsets.begin(), kOffsets.end());
   const auto max_batch_count = *std::max_element(kBatchCounts.begin(), kBatchCounts.end());
 
-  // Creates test input data
-  x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset);
-  y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset);
-  a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset);
-  ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset);
-  scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset);
+  // Creates test input data. Adds a 'canary' region to detect buffer overflows
+  x_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset + kCanarySize);
+  y_source_.resize(max_batch_count * std::max(max_vec, max_matvec)*max_inc + max_offset + kCanarySize);
+  a_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
+  b_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
+  c_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_ld, max_matvec) + max_offset + kCanarySize);
+  ap_source_.resize(max_batch_count * std::max(max_mat, max_matvec)*std::max(max_mat, max_matvec) + max_offset + kCanarySize);
+  scalar_source_.resize(max_batch_count * std::max(max_mat, max_matvec) + max_offset + kCanarySize);
   std::mt19937 mt(kSeed);
   std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
   PopulateVector(x_source_, mt, dist);
@@ -94,7 +94,16 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
   TestStart("regular behaviour", name);
 
   // Iterates over all the to-be-tested combinations of arguments
-  for (const auto &args: test_vector) {
+  for (auto &args: test_vector) {
+
+    // Adds a 'canary' region to detect buffer overflows
+    args.x_size += kCanarySize;
+    args.y_size += kCanarySize;
+    args.a_size += kCanarySize;
+    args.b_size += kCanarySize;
+    args.c_size += kCanarySize;
+    args.ap_size += kCanarySize;
+    args.scalar_size += kCanarySize;
 
     // Prints the current test configuration
     if (verbose_) {
@@ -209,6 +218,20 @@ void TestBlas<T,U>::TestRegular(std::vector<Arguments<U>> &test_vector, const st
         }
       }
     }
+    // Checks the 'canary' region for overflows. NOTE(review): offset 0 is result (id1-1, id2-1) itself, not one past it - confirm
+    for (auto canary_id=size_t{0}; canary_id<kCanarySize; ++canary_id) {
+      auto index = get_index_(args, get_id1_(args) - 1, get_id2_(args) - 1) + canary_id;
+      if (!TestSimilarity(result1[index], result2[index])) {
+        errors++;
+        if (verbose_) {
+          if (get_id2_(args) == 1) { std::cout << std::endl << "   Buffer overflow index " << index << ": "; }
+          else { std::cout << std::endl << "   Buffer overflow " << index << ": "; }
+          std::cout << " " << ToString(result1[index]) << " (reference) versus ";
+          std::cout << " " << ToString(result2[index]) << " (CLBlast)";
+        }
+      }
+    }
+
 
     // Report the results
     if (verbose_ && errors > 0) {
diff --git a/test/routines/level2/xhpr.hpp b/test/routines/level2/xhpr.hpp
index 1e9bbe29..3a79efa0 100644
--- a/test/routines/level2/xhpr.hpp
+++ b/test/routines/level2/xhpr.hpp
@@ -139,7 +139,7 @@ class TestXhpr {
   }
 
   // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<U> &args) { return args.ap_size - args.ap_offset; }
+  static size_t ResultID1(const Arguments<U> &args) { return GetSizeAP(args) - args.ap_offset; }
   static size_t ResultID2(const Arguments<U> &) { return 1; } // N/A for this routine
   static size_t GetResultIndex(const Arguments<U> &args, const size_t id1, const size_t) {
     return id1 + args.ap_offset;
diff --git a/test/routines/level2/xhpr2.hpp b/test/routines/level2/xhpr2.hpp
index 433a5a93..ed37e175 100644
--- a/test/routines/level2/xhpr2.hpp
+++ b/test/routines/level2/xhpr2.hpp
@@ -148,7 +148,7 @@ class TestXhpr2 {
   }
 
   // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
+  static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
   static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
   static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
     return id1 + args.ap_offset;
diff --git a/test/routines/level2/xspr.hpp b/test/routines/level2/xspr.hpp
index af17b8cd..3f0dfe62 100644
--- a/test/routines/level2/xspr.hpp
+++ b/test/routines/level2/xspr.hpp
@@ -139,7 +139,7 @@ class TestXspr {
   }
 
   // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
+  static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
   static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
   static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
     return id1 + args.ap_offset;
diff --git a/test/routines/level2/xspr2.hpp b/test/routines/level2/xspr2.hpp
index b615aca7..b91eab24 100644
--- a/test/routines/level2/xspr2.hpp
+++ b/test/routines/level2/xspr2.hpp
@@ -148,7 +148,7 @@ class TestXspr2 {
   }
 
   // Describes how to compute the indices of the result buffer
-  static size_t ResultID1(const Arguments<T> &args) { return args.ap_size - args.ap_offset; }
+  static size_t ResultID1(const Arguments<T> &args) { return GetSizeAP(args) - args.ap_offset; }
   static size_t ResultID2(const Arguments<T> &) { return 1; } // N/A for this routine
   static size_t GetResultIndex(const Arguments<T> &args, const size_t id1, const size_t) {
     return id1 + args.ap_offset;
diff --git a/test/test_utilities.cpp b/test/test_utilities.cpp
index d8c1995e..59ec949d 100644
--- a/test/test_utilities.cpp
+++ b/test/test_utilities.cpp
@@ -171,6 +171,7 @@ void GetBestParametersFromJSONFile(const std::string& file_name,
       kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '1'), kernel_family.end());
       kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '2'), kernel_family.end());
       kernel_family.erase(std::remove(kernel_family.begin(), kernel_family.end(), '3'), kernel_family.end());
+      if (kernel_family == "Xgemmdirect") { kernel_family = "XgemmDirect"; } // another mismatch: restores the capital 'D'
     }
 
     // Retrieves the best-parameters and sets the override
author	Cedric Nugteren <web@cedricnugteren.nl>	2018-05-19 17:54:27 +0200
committer	Cedric Nugteren <web@cedricnugteren.nl>	2018-05-19 17:54:27 +0200
commit	cbcd4ff7e8e21584a9a1f405c9f4cb979a73b718 (patch)
tree	4a131ed480dc4f496a211453f95adfebaf3f6336
parent	e057a9186a1ed0a169fcf4db7a2598d08f530834 (diff)
parent	507d7bc729eff888dd499e937bf1a636cbdee75b (diff)