// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements the Routine base class (see the header for information about the class).
//
// NOTE(review): this copy of the file appears to have been damaged before it reached review. All
// line breaks were collapsed, and text between angle brackets seems to have been stripped: the
// template arguments of 'std::vector' and 'static_cast' are missing, and a span of code in the
// middle of the file is gone. The code tokens below are reproduced exactly as found; only line
// breaks, indentation and comments were changed. The damaged spots are marked with NOTE(review)
// and must be restored from version control before this file can build.
//
// =================================================================================================

#include "internal/routine.h"
#include "internal/utilities.h"

namespace clblast {
// =================================================================================================

// The cache of compiled OpenCL programs. This is the out-of-class definition of a static member,
// so there is a single cache for all Routine objects. Entries are appended by SetUp() and looked
// up by ProgramIsInCache() and GetProgramFromCache(); nothing in this file removes entries.
// NOTE(review): the element type of this vector is missing from the text (stripped template
// argument list). No locking around the cache is visible in this file.
std::vector Routine::program_cache_;

// Constructor: not much here, because no status codes can be returned. It only initializes members:
// it stores the precision, the queue, the event and the 'routines' argument, obtains the context
// and device from the queue, stores the device's name and its work-item/work-group limits, and
// constructs the database 'db_' from the queue, the routines and the precision.
// NOTE(review): the element type of the 'routines' vector is missing from the text.
Routine::Routine(CommandQueue &queue, Event &event, const std::vector &routines,
                 const Precision precision):
    precision_(precision),
    queue_(queue),
    event_(event),
    context_(queue_.GetContext()),
    device_(queue_.GetDevice()),
    device_name_(device_.Name()),
    max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
    max_work_item_sizes_(device_.MaxWorkItemSizes()),
    max_work_group_size_(device_.MaxWorkGroupSize()),
    db_(queue_, routines, precision_),
    routines_(routines) {
}

// =================================================================================================

// Separate set-up function to allow for status codes to be returned. Makes sure that a compiled
// program matching this object's device name, precision and routines is present in the program
// cache, building it from 'routine_source' when it is not.
//
// Returns:
//   kSuccess              if a matching program was already cached, or was built and cached
//   kNoDoublePrecision    if the precision is kDouble or kComplexDouble and the device's extension
//                         string does not contain kKhronosDoublePrecision
//   kNoHalfPrecision      if the precision is kHalf and the device's extension string does not
//                         contain kKhronosHalfPrecision
//   kBuildProgramFailure  if the build returns CL_BUILD_PROGRAM_FAILURE, or if anything inside the
//                         compilation 'try' block throws
//   kInvalidBinary        if the build returns CL_INVALID_BINARY
StatusCode Routine::SetUp(const std::string &routine_source) {

  // Queries the cache to see whether or not the compiled kernel is already there. If not, it will
  // be built and added to the cache. Note that the extension checks below are therefore skipped
  // entirely when a matching program is already cached.
  if (!ProgramIsInCache()) {

    // Inspects whether or not cl_khr_fp64 is supported in case of double precision
    auto extensions = device_.Extensions();
    if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
      if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
        return StatusCode::kNoDoublePrecision;
      }
    }

    // As above, but for cl_khr_fp16 (half precision)
    if (precision_ == Precision::kHalf) {
      if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
        return StatusCode::kNoHalfPrecision;
      }
    }

    // Loads the common header (typedefs and defines and such). The initializer comes from the
    // included file; since no ';' follows the #include here, the included file presumably supplies
    // both the string literal and the terminating ';' -- TODO confirm against that file.
    std::string common_header =
    #include "kernels/common.opencl"

    // Collects the parameters for this device in the form of defines, and adds the precision. The
    // final source is the defines, followed by the common header, followed by the routine's source.
    // NOTE(review): the target type of the static_cast below is missing from the text.
    auto defines = db_.GetDefines();
    defines += "#define PRECISION "+ToString(static_cast(precision_))+"\n";
    auto source_string = defines + common_header + routine_source;

    // Compiles the kernel. No build options are passed (the options string is empty).
    try {
      auto program = Program(context_, source_string);
      auto options = std::string{};
      auto status = program.Build(device_, options);

      // Checks for compiler crashes/errors/warnings. On a build failure the build log is printed
      // to stdout. Only the two status values below are inspected; for any other status the
      // program is stored in the cache.
      if (status == CL_BUILD_PROGRAM_FAILURE) {
        auto message = program.GetBuildInfo(device_);
        fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
        return StatusCode::kBuildProgramFailure;
      }
      if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }

      // Store the compiled program in the cache, together with the three values that
      // ProgramIsInCache() and GetProgramFromCache() later pass to MatchInCache()
      program_cache_.push_back({program, device_name_, precision_, routines_});
    } catch (...) { return StatusCode::kBuildProgramFailure; }
  }

  // No errors, normal termination of this function
  return StatusCode::kSuccess;
}

// =================================================================================================

// Enqueues a kernel, waits for completion, and checks for errors
// NOTE(review): this description is the original one; the code that enqueues and waits is not
// present in this copy because the function is truncated (see below). Only the validation of the
// local thread sizes is visible. The element types of the 'global' and 'local' vectors are missing
// from the text as well.
StatusCode Routine::RunKernel(const Kernel &kernel, std::vector &global,
                              const std::vector &local) {

  // Tests for validity of the local thread sizes: first the number of dimensions
  if (local.size() > max_work_item_dimensions_) {
    return StatusCode::kInvalidLocalNumDimensions;
  }

  // NOTE(review): damaged text. The loop condition, the increment and the start of the comparison
  // are missing between 'i' and 'max_work_item_sizes_[i]'. What remains suggests a per-dimension
  // check against 'max_work_item_sizes_' -- confirm against version control.
  for (auto i=size_t{0}; i max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; } }

  // The product of all local sizes may not exceed the maximum work-group size
  auto local_size = size_t{1};
  for (auto &item: local) { local_size *= item; }
  if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }

  // Make sure the global thread sizes are at least equal to the local sizes
  // NOTE(review): damaged text. Everything after 'i' in the loop header below is missing, together
  // with the remainder of this function, whatever code followed it, and the beginning of the next
  // function: its signature, the opening 'try', the code that defines 'kernel' and
  // 'use_fast_kernel', and the head of its first kernel.SetArgument call. The tokens are reproduced
  // as found.
  for (auto i=size_t{0}; i(src_ld));

      // NOTE(review): from here on the code belongs to a function whose head was lost. Names such
      // as src_one, src_two, src_ld, src_offset, src, dest_one, dest_two, dest_ld, dest_offset,
      // dest, do_transpose, do_conjugate, pad, upper, lower, use_fast_kernel and kernel are
      // presumably its parameters or locals -- confirm against version control.
      // This first branch sets three kernel arguments in total (indices 0 to 2).
      kernel.SetArgument(1, src());
      kernel.SetArgument(2, dest());
    }
    else {

      // The other branch passes the sizes, leading dimension and offset of both the source and the
      // destination, followed by the buffers themselves.
      // NOTE(review): the target types of the static_casts below are missing from the text.
      kernel.SetArgument(0, static_cast(src_one));
      kernel.SetArgument(1, static_cast(src_two));
      kernel.SetArgument(2, static_cast(src_ld));
      kernel.SetArgument(3, static_cast(src_offset));
      kernel.SetArgument(4, src());
      kernel.SetArgument(5, static_cast(dest_one));
      kernel.SetArgument(6, static_cast(dest_two));
      kernel.SetArgument(7, static_cast(dest_ld));
      kernel.SetArgument(8, static_cast(dest_offset));
      kernel.SetArgument(9, dest());

      // The trailing arguments depend on 'pad': one extra argument (do_conjugate) when it is set,
      // two extra arguments (upper and lower) when it is not
      if (pad) {
        kernel.SetArgument(10, static_cast(do_conjugate));
      }
      else {
        kernel.SetArgument(10, static_cast(upper));
        kernel.SetArgument(11, static_cast(lower));
      }
    }

    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
    // parameters in the database. There are four configurations, selected by 'do_transpose' and
    // 'use_fast_kernel'. With 'use_fast_kernel' set, the destination sizes are divided directly by
    // the database values; otherwise they go through CeilDiv() and Ceil(), which presumably round
    // up to a whole number of work-groups -- TODO confirm against the helpers' definitions.
    // NOTE(review): the element type of the 'std::vector' temporaries below is missing from the
    // text.
    auto status = StatusCode::kSuccess;
    if (do_transpose) {
      if (use_fast_kernel) {
        auto global = std::vector{dest_one / db_["TRA_WPT"],
                                  dest_two / db_["TRA_WPT"]};
        auto local = std::vector{db_["TRA_DIM"], db_["TRA_DIM"]};
        status = RunKernel(kernel, global, local);
      }
      else {
        auto global = std::vector{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
                                  Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
        auto local = std::vector{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
        status = RunKernel(kernel, global, local);
      }
    }
    else {
      if (use_fast_kernel) {
        auto global = std::vector{dest_one / db_["COPY_VW"],
                                  dest_two / db_["COPY_WPT"]};
        auto local = std::vector{db_["COPY_DIMX"], db_["COPY_DIMY"]};
        status = RunKernel(kernel, global, local);
      }
      else {
        auto global = std::vector{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
                                  Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
        auto local = std::vector{db_["PAD_DIMX"], db_["PAD_DIMY"]};
        status = RunKernel(kernel, global, local);
      }
    }
    return status;

  // Any exception thrown inside the 'try' block is reported as kInvalidKernel. The matching 'try'
  // lies in the lost span of text.
  } catch (...) { return StatusCode::kInvalidKernel; }
}

// =================================================================================================

// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise. The cache is searched front to back and the 'program' member of the first entry for
// which MatchInCache(device_name_, precision_, routines_) holds is returned by reference; a
// std::runtime_error is thrown when no entry matches.
// NOTE(review): the returned reference points into 'program_cache_'. A later push_back (as done in
// SetUp) may reallocate a std::vector and invalidate it, so callers should presumably not keep the
// reference across another SetUp call -- verify against the callers.
const Program& Routine::GetProgramFromCache() const {
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(device_name_, precision_, routines_)) {
      return cached_program.program;
    }
  }
  throw std::runtime_error("Internal CLBlast error: Expected program in cache, but found none.");
}

// Queries the cache to see whether or not the compiled kernel is already there. Returns true as
// soon as one cache entry matches this object's device name, precision and routines, and false
// when no entry matches (including when the cache is empty).
bool Routine::ProgramIsInCache() const {
  for (auto &cached_program: program_cache_) {
    if (cached_program.MatchInCache(device_name_, precision_, routines_)) { return true; }
  }
  return false;
}

// =================================================================================================
} // namespace clblast