diff options
30 files changed, 105 insertions, 83 deletions
@@ -5,7 +5,7 @@ Development version (next release) - Performance and correctness tests can now (on top of clBLAS) be performed against CPU BLAS libraries - Fixed the use of events within the library - Changed the enum parameters to match the raw values of the cblas standard -- Added a function to clear the cache of previously compiled programs +- Fixed the cache of previously compiled binaries and added a function to clear it - Added level-1 routines: * SNRM2/DNRM2/ScNRM2/DzNRM2 * SASUM/DASUM/ScASUM/DzASUM diff --git a/include/clblast.h b/include/clblast.h index 57fca119..e473adbe 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -556,7 +556,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, c // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. -StatusCode ClearCompiledProgramCache(); +StatusCode ClearCache(); // ================================================================================================= diff --git a/include/clblast_c.h b/include/clblast_c.h index e23f0305..45e50cff 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -1074,7 +1074,7 @@ StatusCode PUBLIC_API CLBlastZtrsm(const Layout layout, const Side side, const T // CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on // for the same device. This cache can be cleared to free up system memory or in case of debugging. -StatusCode PUBLIC_API CLBlastClearCompiledProgramCache(); +StatusCode PUBLIC_API CLBlastClearCache(); // ================================================================================================= diff --git a/include/internal/cache.h b/include/internal/cache.h index 44fad68d..fa33b78f 100644 --- a/include/internal/cache.h +++ b/include/internal/cache.h @@ -24,9 +24,9 @@ namespace clblast { namespace cache { // ================================================================================================= -// The cache of compiled OpenCL programs, along with some meta-data -struct ProgramCache { - Program program; +// The cache of compiled OpenCL binaries, along with some meta-data +struct BinaryCache { + std::string binary; std::string device_name; Precision precision; std::string routine_name_; @@ -41,28 +41,28 @@ struct ProgramCache { }; // The actual cache, implemented as a vector of the above data-type, and its mutex -static std::vector<ProgramCache> program_cache_; -static std::mutex program_cache_mutex_; +static std::vector<BinaryCache> binary_cache_; +static std::mutex binary_cache_mutex_; // ================================================================================================= -// Stores the compiled program in the cache -void StoreProgramToCache(const Program& program, const std::string &device_name, - const Precision &precision, const std::string &routine_name); +// Stores the compiled binary in the cache +void StoreBinaryToCache(const std::string& binary, const std::string &device_name, + const Precision &precision, const std::string &routine_name); -// Queries the cache and retrieves a matching program. Assumes that the match is available, throws +// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws // otherwise. -const Program& GetProgramFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); +const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name); // Queries the cache to see whether or not the compiled kernel is already there -bool ProgramIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name); +bool BinaryIsInCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name); // ================================================================================================= -// Clears the cache of stored program binaries -StatusCode ClearCompiledProgramCache(); +// Clears the cache of stored binaries +StatusCode ClearCache(); // ================================================================================================= } // namespace cache diff --git a/include/internal/clpp11.h b/include/internal/clpp11.h index 543d423a..b865ab1e 100644 --- a/include/internal/clpp11.h +++ b/include/internal/clpp11.h @@ -283,7 +283,7 @@ class Program { public: // Note that there is no constructor based on the regular OpenCL data-type because of extra state - // Regular constructor with memory management + // Source-based constructor with memory management explicit Program(const Context &context, std::string source): program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), length_(source.length()), @@ -294,6 +294,22 @@ class Program { CheckError(status); } + // Binary-based constructor with memory management + explicit Program(const Device &device, const Context &context, const std::string& binary): + program_(new cl_program, [](cl_program* p) { CheckError(clReleaseProgram(*p)); delete p; }), + length_(binary.length()), + source_(binary), + source_ptr_(&source_[0]) { + auto status1 = CL_SUCCESS; + auto status2 = CL_SUCCESS; + const cl_device_id dev = device(); + *program_ = clCreateProgramWithBinary(context(), 1, &dev, &length_, + reinterpret_cast<const unsigned char**>(&source_ptr_), + &status1, &status2); + CheckError(status1); + CheckError(status2); + } + // Compiles the device program and returns whether or not there where any warnings/errors BuildStatus Build(const Device &device, std::vector<std::string> &options) { auto options_string = std::accumulate(options.begin(), options.end(), std::string{" "}); @@ -322,7 +338,7 @@ class Program { return result; } - // Retrieves an intermediate representation of the compiled program + // Retrieves a binary or an intermediate representation of the compiled program std::string GetIR() const { auto bytes = size_t{0}; CheckError(clGetProgramInfo(*program_, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bytes, nullptr)); @@ -338,7 +354,7 @@ class Program { private: std::shared_ptr<cl_program> program_; size_t length_; - std::string source_; + std::string source_; // Note: the source can also be a binary or IR const char* source_ptr_; }; @@ -633,7 +649,8 @@ class Kernel { // Launches the kernel while waiting for other events CheckError(clEnqueueNDRangeKernel(queue(), *kernel_, static_cast<cl_uint>(global.size()), nullptr, global.data(), local.data(), - waitForEventsPlain.size(), waitForEventsPlain.data(), + static_cast<cl_uint>(waitForEventsPlain.size()), + waitForEventsPlain.data(), event)); } diff --git a/include/internal/routine.h b/include/internal/routine.h index 013769d8..32be6012 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -84,19 +84,23 @@ class Routine { const bool upper = false, const bool lower = false, const bool diagonal_imag_zero = false); - // Stores a newly compiled program into the cache - void StoreProgramToCache(const Program& program) const { - return cache::StoreProgramToCache(program, device_name_, precision_, routine_name_); + // Stores a newly compiled binary into the cache + void StoreBinaryToCache(const std::string& binary) const { + return cache::StoreBinaryToCache(binary, device_name_, precision_, routine_name_); } // Queries the cache and retrieve either a matching program or a boolean whether a match exists. // The first assumes that the program is available in the cache and will throw an exception // otherwise. - const Program& GetProgramFromCache() const { - return cache::GetProgramFromCache(device_name_, precision_, routine_name_); + Program GetProgramFromCache() const { + auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_); + auto program = Program(device_, context_, binary); + auto options = std::vector<std::string>(); + program.Build(device_, options); + return program; } bool ProgramIsInCache() const { - return cache::ProgramIsInCache(device_name_, precision_, routine_name_); + return cache::BinaryIsInCache(device_name_, precision_, routine_name_); } // Non-static variable for the precision. Note that the same variable (but static) might exist in diff --git a/src/cache.cc b/src/cache.cc index beeb1b35..18731a51 100644 --- a/src/cache.cc +++ b/src/cache.cc @@ -21,50 +21,50 @@ namespace clblast { namespace cache { // ================================================================================================= -// Stores the compiled program in the cache -void StoreProgramToCache(const Program& program, const std::string &device_name, - const Precision &precision, const std::string &routine_name) { - program_cache_mutex_.lock(); - program_cache_.push_back({program, device_name, precision, routine_name}); - program_cache_mutex_.unlock(); +// Stores the compiled binary or IR in the cache +void StoreBinaryToCache(const std::string& binary, const std::string &device_name, + const Precision &precision, const std::string &routine_name) { + binary_cache_mutex_.lock(); + binary_cache_.push_back({binary, device_name, precision, routine_name}); + binary_cache_mutex_.unlock(); } -// Queries the cache and retrieves a matching program. Assumes that the match is available, throws +// Queries the cache and retrieves a matching binary. Assumes that the match is available, throws // otherwise. -const Program& GetProgramFromCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(device_name, precision, routine_name)) { - program_cache_mutex_.unlock(); - return cached_program.program; +const std::string& GetBinaryFromCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name) { + binary_cache_mutex_.lock(); + for (auto &cached_binary: binary_cache_) { + if (cached_binary.MatchInCache(device_name, precision, routine_name)) { + binary_cache_mutex_.unlock(); + return cached_binary.binary; } } - program_cache_mutex_.unlock(); - throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none."); + binary_cache_mutex_.unlock(); + throw std::runtime_error("Internal CLBlast error: Expected binary in cache, but found none."); } // Queries the cache to see whether or not the compiled kernel is already there -bool ProgramIsInCache(const std::string &device_name, const Precision &precision, - const std::string &routine_name) { - program_cache_mutex_.lock(); - for (auto &cached_program: program_cache_) { - if (cached_program.MatchInCache(device_name, precision, routine_name)) { - program_cache_mutex_.unlock(); +bool BinaryIsInCache(const std::string &device_name, const Precision &precision, + const std::string &routine_name) { + binary_cache_mutex_.lock(); + for (auto &cached_binary: binary_cache_) { + if (cached_binary.MatchInCache(device_name, precision, routine_name)) { + binary_cache_mutex_.unlock(); return true; } } - program_cache_mutex_.unlock(); + binary_cache_mutex_.unlock(); return false; } // ================================================================================================= -// Clears the cache of stored program binaries -StatusCode ClearCompiledProgramCache() { - program_cache_mutex_.lock(); - program_cache_.clear(); - program_cache_mutex_.unlock(); +// Clears the cache of stored binaries +StatusCode ClearCache() { + binary_cache_mutex_.lock(); + binary_cache_.clear(); + binary_cache_mutex_.unlock(); return StatusCode::kSuccess; } diff --git a/src/clblast.cc b/src/clblast.cc index fac5a539..fe79d7c1 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -1854,8 +1854,8 @@ template StatusCode PUBLIC_API Trsm<double2>(const Layout, const Side, const Tri // ================================================================================================= -// Clears the cache of stored program binaries -StatusCode ClearCompiledProgramCache() { return cache::ClearCompiledProgramCache(); } +// Clears the cache of stored binaries +StatusCode ClearCache() { return cache::ClearCache(); } // ================================================================================================= } // namespace clblast diff --git a/src/clblast_c.cc b/src/clblast_c.cc index 72d93c4b..172bce64 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -2343,9 +2343,9 @@ StatusCode CLBlastZtrsm(const Layout layout, const Side side, const Triangle tri // ================================================================================================= -// Clears the cache of stored program binaries -StatusCode CLBlastClearCompiledProgramCache() { - return static_cast<StatusCode>(clblast::ClearCompiledProgramCache()); +// Clears the cache of stored binaries +StatusCode CLBlastClearCache() { + return static_cast<StatusCode>(clblast::ClearCache()); } // ================================================================================================= diff --git a/src/routine.cc b/src/routine.cc index e0a75e41..cd4d82fb 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -96,8 +96,9 @@ StatusCode Routine<T>::SetUp() { } if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } - // Store the compiled program in the cache (atomic for thread-safety) - StoreProgramToCache(program); + // Store the compiled kernel in the cache + auto binary = program.GetIR(); + StoreBinaryToCache(binary); } catch (...) { return StatusCode::kBuildProgramFailure; } } diff --git a/src/routines/level1/xamax.cc b/src/routines/level1/xamax.cc index ffdfa496..33bd72a6 100644 --- a/src/routines/level1/xamax.cc +++ b/src/routines/level1/xamax.cc @@ -55,7 +55,7 @@ StatusCode Xamax<T>::DoAmax(const size_t n, // Retrieves the Xamax kernels from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel1 = Kernel(program, "Xamax"); auto kernel2 = Kernel(program, "XamaxEpilogue"); diff --git a/src/routines/level1/xasum.cc b/src/routines/level1/xasum.cc index 5799e25a..ea33d7e1 100644 --- a/src/routines/level1/xasum.cc +++ b/src/routines/level1/xasum.cc @@ -55,7 +55,7 @@ StatusCode Xasum<T>::DoAsum(const size_t n, // Retrieves the Xasum kernels from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel1 = Kernel(program, "Xasum"); auto kernel2 = Kernel(program, "XasumEpilogue"); diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index 37d23543..96809a57 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -64,7 +64,7 @@ StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, // Retrieves the Xaxpy kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level1/xcopy.cc b/src/routines/level1/xcopy.cc index 04508383..d34482ce 100644 --- a/src/routines/level1/xcopy.cc +++ b/src/routines/level1/xcopy.cc @@ -64,7 +64,7 @@ StatusCode Xcopy<T>::DoCopy(const size_t n, // Retrieves the Xcopy kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level1/xdot.cc b/src/routines/level1/xdot.cc index 4813a004..b2513485 100644 --- a/src/routines/level1/xdot.cc +++ b/src/routines/level1/xdot.cc @@ -59,7 +59,7 @@ StatusCode Xdot<T>::DoDot(const size_t n, // Retrieves the Xdot kernels from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel1 = Kernel(program, "Xdot"); auto kernel2 = Kernel(program, "XdotEpilogue"); diff --git a/src/routines/level1/xnrm2.cc b/src/routines/level1/xnrm2.cc index ceabe586..86166a0c 100644 --- a/src/routines/level1/xnrm2.cc +++ b/src/routines/level1/xnrm2.cc @@ -55,7 +55,7 @@ StatusCode Xnrm2<T>::DoNrm2(const size_t n, // Retrieves the Xnrm2 kernels from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel1 = Kernel(program, "Xnrm2"); auto kernel2 = Kernel(program, "Xnrm2Epilogue"); diff --git a/src/routines/level1/xscal.cc b/src/routines/level1/xscal.cc index e83e73fd..b92e2cdf 100644 --- a/src/routines/level1/xscal.cc +++ b/src/routines/level1/xscal.cc @@ -60,7 +60,7 @@ StatusCode Xscal<T>::DoScal(const size_t n, const T alpha, // Retrieves the Xscal kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level1/xswap.cc b/src/routines/level1/xswap.cc index bc425f40..bfc4a739 100644 --- a/src/routines/level1/xswap.cc +++ b/src/routines/level1/xswap.cc @@ -64,7 +64,7 @@ StatusCode Xswap<T>::DoSwap(const size_t n, // Retrieves the Xswap kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index 24e87db0..f8985038 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -136,7 +136,7 @@ StatusCode Xgemv<T>::MatVec(const Layout layout, const Transpose a_transpose, // Retrieves the Xgemv kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the kernel arguments diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index dda78232..686c7e60 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -66,7 +66,7 @@ StatusCode Xger<T>::DoGer(const Layout layout, // Retrieves the Xgemv kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xger"); // Sets the kernel arguments diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index aba665b0..a7116213 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -79,7 +79,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, // Retrieves the Xgemv kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xher"); // Sets the kernel arguments diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index bcd6488f..3fd1a961 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -68,7 +68,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, // Retrieves the Xgemv kernel from the compiled binary try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, "Xher2"); // Sets the kernel arguments diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 7557dcc3..aa081e81 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -107,7 +107,7 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == m_ceiled && a_two == k_ceiled && a_ld == m_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index c0a4306a..d2fbf36e 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -61,7 +61,7 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the arguments for the hermitian-to-squared kernel diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index 4d5a4d35..2c2c815d 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -93,7 +93,7 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a1_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index 574debe4..414c4760 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -90,7 +90,7 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 914a326a..a39026f1 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -61,7 +61,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the arguments for the symmetric-to-squared kernel diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 44d0024e..3206c669 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -91,7 +91,7 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a_no_temp = ab_one == n_ceiled && ab_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index 44ed8d35..741ad064 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -87,7 +87,7 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const try { // Loads the program from the database - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); // Determines whether or not temporary matrices are needed auto a_no_temp = a_one == n_ceiled && a_two == k_ceiled && a_ld == n_ceiled && a_offset == 0 && diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 484cf040..9e3b27b4 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -63,7 +63,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards try { - auto& program = GetProgramFromCache(); + const auto program = GetProgramFromCache(); auto kernel = Kernel(program, kernel_name); // Sets the arguments for the triangular-to-squared kernel |