summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2018-05-19 17:54:27 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2018-05-19 17:54:27 +0200
commitcbcd4ff7e8e21584a9a1f405c9f4cb979a73b718 (patch)
tree4a131ed480dc4f496a211453f95adfebaf3f6336 /src
parente057a9186a1ed0a169fcf4db7a2598d08f530834 (diff)
parent507d7bc729eff888dd499e937bf1a636cbdee75b (diff)
Merge branch 'master' into CLBlast-267-convgemm
Diffstat (limited to 'src')
-rw-r--r--src/cache.cpp4
-rw-r--r--src/cache.hpp6
-rw-r--r--src/clpp11.hpp42
-rw-r--r--src/routine.cpp10
-rw-r--r--src/routine.hpp6
-rw-r--r--src/routines/common.cpp24
-rw-r--r--src/routines/common.hpp10
-rw-r--r--src/tuning/routines/xgemm.cpp18
-rw-r--r--src/tuning/tuning.cpp8
-rw-r--r--src/tuning/tuning_api.cpp8
-rw-r--r--src/utilities/compile.cpp11
-rw-r--r--src/utilities/compile.hpp3
-rw-r--r--src/utilities/utilities.hpp3
13 files changed, 84 insertions, 69 deletions
diff --git a/src/cache.cpp b/src/cache.cpp
index 4b74b0a1..e15a72a5 100644
--- a/src/cache.cpp
+++ b/src/cache.cpp
@@ -117,8 +117,8 @@ template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const;
// =================================================================================================
-template class Cache<ProgramKey, Program>;
-template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
+template class Cache<ProgramKey, std::shared_ptr<Program>>;
+template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
template void ProgramCache::RemoveBySubset<1, 2>(const ProgramKey &); // precision and routine name
// =================================================================================================
diff --git a/src/cache.hpp b/src/cache.hpp
index 228fbccb..89973f61 100644
--- a/src/cache.hpp
+++ b/src/cache.hpp
@@ -83,10 +83,10 @@ extern template std::string BinaryCache::Get(const BinaryKeyRef &, bool *) const
typedef std::tuple<RawContext, RawDeviceID, Precision, std::string> ProgramKey;
typedef std::tuple<const RawContext &, const RawDeviceID &, const Precision &, const std::string &> ProgramKeyRef;
-typedef Cache<ProgramKey, Program> ProgramCache;
+typedef Cache<ProgramKey, std::shared_ptr<Program>> ProgramCache;
-extern template class Cache<ProgramKey, Program>;
-extern template Program ProgramCache::Get(const ProgramKeyRef &, bool *) const;
+extern template class Cache<ProgramKey, std::shared_ptr<Program>>;
+extern template std::shared_ptr<Program> ProgramCache::Get(const ProgramKeyRef &, bool *) const;
// =================================================================================================
diff --git a/src/clpp11.hpp b/src/clpp11.hpp
index c4b721b9..ce6f39cb 100644
--- a/src/clpp11.hpp
+++ b/src/clpp11.hpp
@@ -437,47 +437,41 @@ using ContextPointer = cl_context*;
// C++11 version of 'cl_program'.
class Program {
public:
- Program() = default;
// Source-based constructor with memory management
- explicit Program(const Context &context, const std::string &source):
- program_(new cl_program, [](cl_program* p) {
- #ifndef _MSC_VER // 'clReleaseProgram' caused an access violation with Visual Studio
- if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
- #endif
- delete p;
- }) {
+ explicit Program(const Context &context, const std::string &source) {
const char *source_ptr = &source[0];
const auto length = source.length();
auto status = CL_SUCCESS;
- *program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
+ program_ = clCreateProgramWithSource(context(), 1, &source_ptr, &length, &status);
CLCudaAPIError::Check(status, "clCreateProgramWithSource");
}
// Binary-based constructor with memory management
- explicit Program(const Device &device, const Context &context, const std::string &binary):
- program_(new cl_program, [](cl_program* p) {
- if (*p) { CheckErrorDtor(clReleaseProgram(*p)); }
- delete p;
- }) {
+ explicit Program(const Device &device, const Context &context, const std::string &binary) {
const char *binary_ptr = &binary[0];
const auto length = binary.length();
auto status1 = CL_SUCCESS;
auto status2 = CL_SUCCESS;
const auto dev = device();
- *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
+ program_ = clCreateProgramWithBinary(context(), 1, &dev, &length,
reinterpret_cast<const unsigned char**>(&binary_ptr),
&status1, &status2);
CLCudaAPIError::Check(status1, "clCreateProgramWithBinary (binary status)");
CLCudaAPIError::Check(status2, "clCreateProgramWithBinary");
}
+ // Clean-up
+ ~Program() {
+ if (program_) { CheckErrorDtor(clReleaseProgram(program_)); }
+ }
+
// Compiles the device program and checks whether or not there are any warnings/errors
void Build(const Device &device, std::vector<std::string> &options) {
options.push_back("-cl-std=CL1.1");
auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "});
const cl_device_id dev = device();
- CheckError(clBuildProgram(*program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
+ CheckError(clBuildProgram(program_, 1, &dev, options_string.c_str(), nullptr, nullptr));
}
// Confirms whether a certain status code is an actual compilation error or warning
@@ -489,28 +483,28 @@ class Program {
std::string GetBuildInfo(const Device &device) const {
auto bytes = size_t{0};
auto query = cl_program_build_info{CL_PROGRAM_BUILD_LOG};
- CheckError(clGetProgramBuildInfo(*program_, device(), query, 0, nullptr, &bytes));
+ CheckError(clGetProgramBuildInfo(program_, device(), query, 0, nullptr, &bytes));
auto result = std::string{};
result.resize(bytes);
- CheckError(clGetProgramBuildInfo(*program_, device(), query, bytes, &result[0], nullptr));
+ CheckError(clGetProgramBuildInfo(program_, device(), query, bytes, &result[0], nullptr));
return result;
}
// Retrieves a binary or an intermediate representation of the compiled program
std::string GetIR() const {
auto bytes = size_t{0};
- CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
+ CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr));
auto result = std::string{};
result.resize(bytes);
auto result_ptr = result.data();
- CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
+ CheckError(clGetProgramInfo(program_, CL_PROGRAM_BINARIES, sizeof(char*), &result_ptr, nullptr));
return result;
}
// Accessor to the private data-member
- const cl_program& operator()() const { return *program_; }
+ const cl_program& operator()() const { return program_; }
private:
- std::shared_ptr<cl_program> program_;
+ cl_program program_ = nullptr;
};
// =================================================================================================
@@ -757,13 +751,13 @@ class Kernel {
}
// Regular constructor with memory management
- explicit Kernel(const Program &program, const std::string &name):
+ explicit Kernel(const std::shared_ptr<Program> program, const std::string &name):
kernel_(new cl_kernel, [](cl_kernel* k) {
if (*k) { CheckErrorDtor(clReleaseKernel(*k)); }
delete k;
}) {
auto status = CL_SUCCESS;
- *kernel_ = clCreateKernel(program(), name.c_str(), &status);
+ *kernel_ = clCreateKernel(program->operator()(), name.c_str(), &status);
CLCudaAPIError::Check(status, "clCreateKernel");
}
diff --git a/src/routine.cpp b/src/routine.cpp
index fa5934f6..4caa4d7b 100644
--- a/src/routine.cpp
+++ b/src/routine.cpp
@@ -96,10 +96,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
auto binary = BinaryCache::Instance().Get(BinaryKeyRef{platform_id, precision_, routine_info, device_name },
&has_binary);
if (has_binary) {
- program_ = Program(device_, context_, binary);
- program_.Build(device_, options);
+ program_ = std::make_shared<Program>(Program(device_, context_, binary));
+ program_->Build(device_, options);
ProgramCache::Instance().Store(ProgramKey{ context_(), device_(), precision_, routine_info },
- Program{ program_ });
+ std::shared_ptr<Program>{program_});
return;
}
@@ -135,10 +135,10 @@ void Routine::InitProgram(std::initializer_list<const char *> source) {
// Store the compiled binary and program in the cache
BinaryCache::Instance().Store(BinaryKey{platform_id, precision_, routine_info, device_name},
- program_.GetIR());
+ program_->GetIR());
ProgramCache::Instance().Store(ProgramKey{context_(), device_(), precision_, routine_info},
- Program{ program_ });
+ std::shared_ptr<Program>{program_});
}
// =================================================================================================
diff --git a/src/routine.hpp b/src/routine.hpp
index 00f7b5cc..8db5e5a9 100644
--- a/src/routine.hpp
+++ b/src/routine.hpp
@@ -33,6 +33,7 @@ namespace clblast {
class Routine {
public:
+ // Initializes db_, fetching cached database or building one
static void InitDatabase(const Device &device, const std::vector<std::string> &kernel_names,
const Precision precision, const std::vector<database::DatabaseEntry> &userDatabase,
Databases &db) {
@@ -78,9 +79,6 @@ class Routine {
// Initializes program_, fetching cached program or building one
void InitProgram(std::initializer_list<const char *> source);
- // Initializes db_, fetching cached database or building one
- void InitDatabase(const std::vector<database::DatabaseEntry> &userDatabase);
-
protected:
// Non-static variable for the precision
@@ -97,7 +95,7 @@ class Routine {
const Device device_;
// Compiled program (either retrieved from cache or compiled in slow path)
- Program program_;
+ std::shared_ptr<Program> program_;
// Connection to the database for all the device-specific parameters
Databases db_;
diff --git a/src/routines/common.cpp b/src/routines/common.cpp
index a4d1f577..5b80e3f2 100644
--- a/src/routines/common.cpp
+++ b/src/routines/common.cpp
@@ -77,7 +77,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
- const Program &program, const Databases &,
+ const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
@@ -95,26 +95,26 @@ void FillMatrix(Queue &queue, const Device &device,
}
// Compiles the above function
-template void FillMatrix<half>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<half>&, const half);
-template void FillMatrix<float>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<float>&, const float);
-template void FillMatrix<double>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<double>&, const double);
-template void FillMatrix<float2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<float2>&, const float2);
-template void FillMatrix<double2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillMatrix<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const size_t, const Buffer<double2>&, const double2);
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
- const Program &program, const Databases &,
+ const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
@@ -131,19 +131,19 @@ void FillVector(Queue &queue, const Device &device,
}
// Compiles the above function
-template void FillVector<half>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<half>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<half>&, const half);
-template void FillVector<float>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<float>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<float>&, const float);
-template void FillVector<double>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<double>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<double>&, const double);
-template void FillVector<float2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<float2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<float2>&, const float2);
-template void FillVector<double2>(Queue&, const Device&, const Program&, const Databases&,
+template void FillVector<double2>(Queue&, const Device&, const std::shared_ptr<Program>, const Databases&,
EventPointer, const std::vector<Event>&, const size_t, const size_t,
const size_t, const Buffer<double2>&, const double2);
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 6cbe1e1b..b909243d 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -36,7 +36,7 @@ void RunKernel(Kernel &kernel, Queue &queue, const Device &device,
// Sets all elements of a matrix to a constant value
template <typename T>
void FillMatrix(Queue &queue, const Device &device,
- const Program &program, const Databases &,
+ const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t m, const size_t n, const size_t ld, const size_t offset,
const Buffer<T> &dest,
@@ -45,7 +45,7 @@ void FillMatrix(Queue &queue, const Device &device,
// Sets all elements of a vector to a constant value
template <typename T>
void FillVector(Queue &queue, const Device &device,
- const Program &program, const Databases &,
+ const std::shared_ptr<Program> program, const Databases &,
EventPointer event, const std::vector<Event> &waitForEvents,
const size_t n, const size_t inc, const size_t offset,
const Buffer<T> &dest,
@@ -66,7 +66,7 @@ void PadCopyTransposeMatrix(Queue &queue, const Device &device,
const size_t dest_ld, const size_t dest_offset,
const Buffer<T> &dest,
const T alpha,
- const Program &program, const bool do_pad,
+ const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const bool upper = false, const bool lower = false,
const bool diagonal_imag_zero = false) {
@@ -186,7 +186,7 @@ void PadCopyTransposeMatrixBatched(Queue &queue, const Device &device,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const Buffer<int> &dest_offsets,
const Buffer<T> &dest,
- const Program &program, const bool do_pad,
+ const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const size_t batch_count) {
@@ -250,7 +250,7 @@ void PadCopyTransposeMatrixStridedBatched(Queue &queue, const Device &device,
const size_t dest_one, const size_t dest_two,
const size_t dest_ld, const size_t dest_offset,
const size_t dest_stride, const Buffer<T> &dest,
- const Program &program, const bool do_pad,
+ const std::shared_ptr<Program> program, const bool do_pad,
const bool do_transpose, const bool do_conjugate,
const size_t batch_count) {
diff --git a/src/tuning/routines/xgemm.cpp b/src/tuning/routines/xgemm.cpp
index 0721ad7c..92aab611 100644
--- a/src/tuning/routines/xgemm.cpp
+++ b/src/tuning/routines/xgemm.cpp
@@ -15,8 +15,10 @@
#include <exception>
#include <string>
#include <vector>
+#include <iostream>
#include "utilities/utilities.hpp"
+#include "test/test_utilities.hpp"
#include "tuning/routines/routine_tuner.hpp"
namespace clblast {
@@ -101,6 +103,22 @@ void TuneXgemm(int argc, char* argv[]) {
const auto context = Context(device);
auto queue = Queue(context, device);
+ // Pre-load GEMM kernel tuning results if they exist
+ printf("* The GEMM routine tuner requires already tuned kernels\n");
+ printf(" Applying tuning results from disk if they exist...\n\n");
+ const auto kernel_names = {"xgemm_1", "xgemm_direct_1", "copy", "pad", "transpose", "padtranspose"};
+ for (const auto& kernel_name : kernel_names) {
+ const auto tuner_file_name = "clblast_" + std::string{kernel_name} + "_" +
+ ToString(static_cast<int>(precision)) + ".json";
+ printf("* Looking for tuning results in the current folder: '%s'\n", tuner_file_name.c_str());
+ if (std::ifstream(tuner_file_name)) { // Checks if the file exists on disk
+ OverrideParametersFromJSONFiles({tuner_file_name}, device(), precision);
+ }
+ else {
+ printf(" Not found: assuming the kernel '%s' is already tuned\n\n", kernel_name);
+ }
+ }
+
// Run the tuners for the XGEMM routines
TuneKernelSelection<T>(platform, device, context, queue, precision, RunGemmRoutine<T>,
64, 2048, 64, 1, num_runs,
diff --git a/src/tuning/tuning.cpp b/src/tuning/tuning.cpp
index dd4a83e6..216f4b31 100644
--- a/src/tuning/tuning.cpp
+++ b/src/tuning/tuning.cpp
@@ -150,11 +150,11 @@ void Tuner(int argc, char* argv[], const int V,
const auto device_architecture = GetDeviceArchitecture(device);
const auto device_name = GetDeviceName(device);
- // Creates input buffers with random data
+ // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
const auto buffer_sizes = std::vector<size_t>{
- settings.size_x, settings.size_y,
- settings.size_a, settings.size_b, settings.size_c,
- settings.size_temp
+ settings.size_x + kCanarySize, settings.size_y + kCanarySize,
+ settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize,
+ settings.size_temp + kCanarySize
};
std::mt19937 mt(kSeed);
std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
diff --git a/src/tuning/tuning_api.cpp b/src/tuning/tuning_api.cpp
index f1da40c1..2eec2e2e 100644
--- a/src/tuning/tuning_api.cpp
+++ b/src/tuning/tuning_api.cpp
@@ -241,11 +241,11 @@ StatusCode TunerAPI(Queue &queue, const Arguments<T> &args, const int V,
const auto device_architecture = GetDeviceArchitecture(device);
const auto device_name = GetDeviceName(device);
- // Creates input buffers with random data
+ // Creates input buffers with random data. Adds a 'canary' region to detect buffer overflows.
const auto buffer_sizes = std::vector<size_t>{
- settings.size_x, settings.size_y,
- settings.size_a, settings.size_b, settings.size_c,
- settings.size_temp
+ settings.size_x + kCanarySize, settings.size_y + kCanarySize,
+ settings.size_a + kCanarySize, settings.size_b + kCanarySize, settings.size_c + kCanarySize,
+ settings.size_temp + kCanarySize
};
const auto seed = static_cast<unsigned long>(time(nullptr));
std::mt19937 mt(seed);
diff --git a/src/utilities/compile.cpp b/src/utilities/compile.cpp
index 65131cca..05c29944 100644
--- a/src/utilities/compile.cpp
+++ b/src/utilities/compile.cpp
@@ -21,7 +21,8 @@ namespace clblast {
// =================================================================================================
// Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
+std::shared_ptr<Program> CompileFromSource(
+ const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,
@@ -93,13 +94,13 @@ Program CompileFromSource(const std::string &source_string, const Precision prec
}
// Compiles the kernel
- auto program = Program(context, kernel_string);
+ auto program = std::make_shared<Program>(context, kernel_string);
try {
- program.Build(device, options);
+ program->Build(device, options);
} catch (const CLCudaAPIBuildError &e) {
- if (program.StatusIsCompilationWarningOrError(e.status()) && !silent) {
+ if (program->StatusIsCompilationWarningOrError(e.status()) && !silent) {
fprintf(stdout, "OpenCL compiler error/warning:\n%s\n",
- program.GetBuildInfo(device).c_str());
+ program->GetBuildInfo(device).c_str());
}
throw;
}
diff --git a/src/utilities/compile.hpp b/src/utilities/compile.hpp
index 1b4f4a7a..13e8c363 100644
--- a/src/utilities/compile.hpp
+++ b/src/utilities/compile.hpp
@@ -24,7 +24,8 @@ namespace clblast {
// =================================================================================================
// Compiles a program from source code
-Program CompileFromSource(const std::string &source_string, const Precision precision,
+std::shared_ptr<Program> CompileFromSource(
+ const std::string &source_string, const Precision precision,
const std::string &routine_name,
const Device& device, const Context& context,
std::vector<std::string>& options,
diff --git a/src/utilities/utilities.hpp b/src/utilities/utilities.hpp
index 2d2cd62e..a29e531a 100644
--- a/src/utilities/utilities.hpp
+++ b/src/utilities/utilities.hpp
@@ -52,6 +52,9 @@ const std::string kKhronosIntelSubgroups = "cl_intel_subgroups";
// Catched an unknown error
constexpr auto kUnknownError = -999;
+// Canary size to add to buffers to check for buffer overflows
+constexpr auto kCanarySize = 127;
+
// =================================================================================================
// The routine-specific arguments in string form