diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/clblast.cc | 78 | ||||
-rw-r--r-- | src/database.cc | 18 | ||||
-rw-r--r-- | src/routine.cc | 98 | ||||
-rw-r--r-- | src/routines/level1/xaxpy.cc | 8 | ||||
-rw-r--r-- | src/routines/level2/xgemv.cc | 10 | ||||
-rw-r--r-- | src/routines/level3/xgemm.cc | 16 | ||||
-rw-r--r-- | src/routines/level3/xhemm.cc | 10 | ||||
-rw-r--r-- | src/routines/level3/xher2k.cc | 20 | ||||
-rw-r--r-- | src/routines/level3/xherk.cc | 14 | ||||
-rw-r--r-- | src/routines/level3/xsymm.cc | 10 | ||||
-rw-r--r-- | src/routines/level3/xsyr2k.cc | 16 | ||||
-rw-r--r-- | src/routines/level3/xsyrk.cc | 12 | ||||
-rw-r--r-- | src/routines/level3/xtrmm.cc | 8 |
13 files changed, 165 insertions, 153 deletions
diff --git a/src/clblast.cc b/src/clblast.cc index 6cb4086e..eddb8022 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -43,7 +43,7 @@ StatusCode Axpy(const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xaxpy<T>(queue_cpp, event_cpp); @@ -53,8 +53,8 @@ StatusCode Axpy(const size_t n, const T alpha, // Runs the routine return routine.DoAxpy(n, alpha, - Buffer(x_buffer), x_offset, x_inc, - Buffer(y_buffer), y_offset, y_inc); + Buffer<T>(x_buffer), x_offset, x_inc, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Axpy<float>(const size_t, const float, const cl_mem, const size_t, const size_t, @@ -85,7 +85,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xgemv<T>(queue_cpp, event_cpp); @@ -95,9 +95,9 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose, // Runs the routine return routine.DoGemv(layout, a_transpose, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(x_buffer), x_offset, x_inc, beta, - Buffer(y_buffer), y_offset, y_inc); + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(x_buffer), x_offset, x_inc, beta, + Buffer<T>(y_buffer), y_offset, y_inc); } template StatusCode Gemv<float>(const Layout, const Transpose, const size_t, const size_t, const float, @@ -135,7 +135,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xgemm<T>(queue_cpp, event_cpp); @@ -145,9 +145,9 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos // Runs the routine return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, beta, + Buffer<T>(c_buffer), c_offset, c_ld); } template StatusCode Gemm<float>(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float, @@ -184,7 +184,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsymm<T>(queue_cpp, event_cpp); @@ -194,9 +194,9 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, // Runs the routine return routine.DoSymm(layout, side, triangle, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, beta, + Buffer<T>(c_buffer), c_offset, c_ld); } template StatusCode Symm<float>(const Layout, const Side, const Triangle, const size_t, const size_t, const float, @@ -233,7 +233,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xhemm<T>(queue_cpp, event_cpp); @@ -243,9 +243,9 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, // Runs the routine return routine.DoHemm(layout, side, triangle, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, beta, + Buffer<T>(c_buffer), c_offset, c_ld); } template StatusCode Hemm<float2>(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, @@ -269,7 +269,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsyrk<T>(queue_cpp, event_cpp); @@ -279,8 +279,8 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_ // Runs the routine return routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer<T>(a_buffer), a_offset, a_ld, beta, + Buffer<T>(c_buffer), c_offset, c_ld); } template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, @@ -312,7 +312,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xherk<std::complex<T>,T>(queue_cpp, event_cpp); @@ -322,8 +322,8 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_ // Runs the routine return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer<std::complex<T>>(a_buffer), a_offset, a_ld, beta, + Buffer<std::complex<T>>(c_buffer), c_offset, c_ld); } template StatusCode Herk<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, @@ -346,7 +346,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xsyr2k<T>(queue_cpp, event_cpp); @@ -356,9 +356,9 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a // Runs the routine return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, beta, + Buffer<T>(c_buffer), c_offset, c_ld); } template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, @@ -395,7 +395,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xher2k<T,U>(queue_cpp, event_cpp); @@ -405,9 +405,9 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a // Runs the routine return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld, beta, - Buffer(c_buffer), c_offset, c_ld); + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld, beta, + Buffer<T>(c_buffer), c_offset, c_ld); } template StatusCode Her2k<float2,float>(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, @@ -433,7 +433,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xtrmm<T>(queue_cpp, event_cpp); @@ -443,8 +443,8 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, // Runs the routine return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld); + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); } template StatusCode Trmm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, @@ -483,7 +483,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { - auto queue_cpp = CommandQueue(*queue); + auto queue_cpp = Queue(*queue); auto event_cpp = Event(*event); auto routine = Xtrsm<T>(queue_cpp, event_cpp); @@ -493,8 +493,8 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, // Runs the routine return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - Buffer(a_buffer), a_offset, a_ld, - Buffer(b_buffer), b_offset, b_ld); + Buffer<T>(a_buffer), a_offset, a_ld, + Buffer<T>(b_buffer), b_offset, b_ld); } template StatusCode Trsm<float>(const Layout, const Side, const Triangle, const Transpose, const Diagonal, diff --git a/src/database.cc b/src/database.cc index 4d9d844e..258d861e 100644 --- a/src/database.cc +++ b/src/database.cc @@ -39,7 +39,7 @@ const std::vector<Database::DatabaseEntry> Database::database = { // ================================================================================================= // Constructor, computing device properties and populating the parameter-vector from the database -Database::Database(const CommandQueue &queue, const std::vector<std::string> &kernels, +Database::Database(const Queue &queue, const std::vector<std::string> &kernels, const Precision precision): parameters_{} { @@ -71,7 +71,7 @@ std::string Database::GetDefines() const { // Searches the database for the right kernel and precision Database::Parameters Database::Search(const std::string &this_kernel, - const cl_device_type this_type, + const std::string &this_type, const std::string &this_vendor, const std::string &this_device, const Precision this_precision) const { @@ -81,13 +81,13 @@ Database::Parameters Database::Search(const std::string &this_kernel, // Searches for the right vendor and device type, or selects the default if unavailable. This // assumes that the default vendor / device type is last in the database. for (auto &vendor: db.vendors) { - if (VendorEqual(vendor.name, this_vendor) && - (vendor.type == this_type || vendor.type == CL_DEVICE_TYPE_ALL)) { + if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) && + (vendor.type == this_type || vendor.type == kDeviceTypeAll)) { // Searches for the right device. If the current device is unavailable, selects the vendor // default parameters. This assumes the default is last in the database. for (auto &device: vendor.devices) { - if (device.name == this_device || device.name == kDefault) { + if (device.name == this_device || device.name == kDefaultDevice) { // Sets the parameters accordingly return device.parameters; @@ -102,13 +102,5 @@ Database::Parameters Database::Search(const std::string &this_kernel, throw std::runtime_error("Database error, could not find a suitable entry"); } -// Determines the equality between two vendor names. This is implemented because vendor names can -// be ambigious and might change between different SDK or driver versions. -bool Database::VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const { - if (db_vendor == kDefault) { return true; } - if (db_vendor == cl_vendor) { return true; } - return false; -} - // ================================================================================================= } // namespace clblast diff --git a/src/routine.cc b/src/routine.cc index aded1a31..31476c42 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -13,17 +13,17 @@ #include "internal/routine.h" -#include "internal/utilities.h" - namespace clblast { // ================================================================================================= // The cache of compiled OpenCL programs -std::vector<Routine::ProgramCache> Routine::program_cache_; +template <typename T> +std::vector<typename Routine<T>::ProgramCache> Routine<T>::program_cache_; // Constructor: not much here, because no status codes can be returned -Routine::Routine(CommandQueue &queue, Event &event, const std::string &name, - const std::vector<std::string> &routines, const Precision precision): +template <typename T> +Routine<T>::Routine(Queue &queue, Event &event, const std::string &name, + const std::vector<std::string> &routines, const Precision precision): precision_(precision), routine_name_(name), queue_(queue), @@ -40,14 +40,15 @@ Routine::Routine(CommandQueue &queue, Event &event, const std::string &name, // ================================================================================================= // Separate set-up function to allow for status codes to be returned -StatusCode Routine::SetUp() { +template <typename T> +StatusCode Routine<T>::SetUp() { // Queries the cache to see whether or not the compiled kernel is already there. If not, it will // be built and added to the cache. if (!ProgramIsInCache()) { // Inspects whether or not cl_khr_fp64 is supported in case of double precision - auto extensions = device_.Extensions(); + auto extensions = device_.Capabilities(); if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { return StatusCode::kNoDoublePrecision; @@ -85,16 +86,16 @@ StatusCode Routine::SetUp() { // Compiles the kernel try { auto program = Program(context_, source_string); - auto options = std::string{}; - auto status = program.Build(device_, options); + auto options = std::vector<std::string>(); + auto build_status = program.Build(device_, options); // Checks for compiler crashes/errors/warnings - if (status == CL_BUILD_PROGRAM_FAILURE) { + if (build_status == BuildStatus::kError) { auto message = program.GetBuildInfo(device_); fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str()); return StatusCode::kBuildProgramFailure; } - if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; } + if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; } // Store the compiled program in the cache program_cache_.push_back({program, device_name_, precision_, routine_name_}); @@ -108,8 +109,9 @@ StatusCode Routine::SetUp() { // ================================================================================================= // Enqueues a kernel, waits for completion, and checks for errors -StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global, - const std::vector<size_t> &local) { +template <typename T> +StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global, + const std::vector<size_t> &local) { // Tests for validity of the local thread sizes if (local.size() > max_work_item_dimensions_) { @@ -132,12 +134,14 @@ StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global, if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; } // Launches the kernel (and checks for launch errors) - auto status = queue_.EnqueueKernel(kernel, global, local, event_); - if (status != CL_SUCCESS) { return StatusCode::kKernelLaunchError; } + try { + kernel.Launch(queue_, global, local, event_); + } catch (...) { return StatusCode::kKernelLaunchError; } // Waits for completion of the kernel - status = event_.Wait(); - if (status != CL_SUCCESS) { return StatusCode::kKernelRunError; } + try { + queue_.Finish(event_); + } catch (...) { return StatusCode::kKernelRunError; } // No errors, normal termination of this function return StatusCode::kSuccess; @@ -147,8 +151,9 @@ StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global, // Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a // sufficient buffer size. -StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld, const size_t data_size) { +template <typename T> +StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer, + const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimA; } try { auto required_size = (ld*two + offset)*data_size; @@ -160,8 +165,9 @@ StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer // Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a // sufficient buffer size. -StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld, const size_t data_size) { +template <typename T> +StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer, + const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimB; } try { auto required_size = (ld*two + offset)*data_size; @@ -173,8 +179,9 @@ StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer // Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a // sufficient buffer size. -StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer &buffer, - const size_t offset, const size_t ld, const size_t data_size) { +template <typename T> +StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer, + const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimC; } try { auto required_size = (ld*two + offset)*data_size; @@ -188,8 +195,9 @@ StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer // Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a // sufficient buffer size. -StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size_t offset, - const size_t inc, const size_t data_size) { +template <typename T> +StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset, + const size_t inc, const size_t data_size) { if (inc == 0) { return StatusCode::kInvalidIncrementX; } try { auto required_size = (n*inc + offset)*data_size; @@ -201,8 +209,9 @@ StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size // Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a // sufficient buffer size. -StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size_t offset, - const size_t inc, const size_t data_size) { +template <typename T> +StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset, + const size_t inc, const size_t data_size) { if (inc == 0) { return StatusCode::kInvalidIncrementY; } try { auto required_size = (n*inc + offset)*data_size; @@ -215,16 +224,17 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size // ================================================================================================= // Copies or transposes a matrix and pads/unpads it with zeros -StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, - const size_t src_ld, const size_t src_offset, - const Buffer &src, - const size_t dest_one, const size_t dest_two, - const size_t dest_ld, const size_t dest_offset, - const Buffer &dest, - const Program &program, const bool do_pad, - const bool do_transpose, const bool do_conjugate, - const bool upper, const bool lower, - const bool diagonal_imag_zero) { +template <typename T> +StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two, + const size_t src_ld, const size_t src_offset, + const Buffer<T> &src, + const size_t dest_one, const size_t dest_two, + const size_t dest_ld, const size_t dest_offset, + const Buffer<T> &dest, + const Program &program, const bool do_pad, + const bool do_transpose, const bool do_conjugate, + const bool upper, const bool lower, + const bool diagonal_imag_zero) { // Determines whether or not the fast-version could potentially be used auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) && @@ -328,7 +338,8 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr // Queries the cache and retrieves a matching program. Assumes that the match is available, throws // otherwise. -const Program& Routine::GetProgramFromCache() const { +template <typename T> +const Program& Routine<T>::GetProgramFromCache() const { for (auto &cached_program: program_cache_) { if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return cached_program.program; @@ -338,7 +349,8 @@ const Program& Routine::GetProgramFromCache() const { } // Queries the cache to see whether or not the compiled kernel is already there -bool Routine::ProgramIsInCache() const { +template <typename T> +bool Routine<T>::ProgramIsInCache() const { for (auto &cached_program: program_cache_) { if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; } } @@ -346,4 +358,12 @@ bool Routine::ProgramIsInCache() const { } // ================================================================================================= + +// Compiles the templated class +template class Routine<float>; +template class Routine<double>; +template class Routine<float2>; +template class Routine<double2>; + +// ================================================================================================= } // namespace clblast diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc index e6b320d9..7646b0e4 100644 --- a/src/routines/level1/xaxpy.cc +++ b/src/routines/level1/xaxpy.cc @@ -29,8 +29,8 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event): - Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) { +Xaxpy<T>::Xaxpy(Queue &queue, Event &event): + Routine<T>(queue, event, "AXPY", {"Xaxpy"}, precision_) { source_string_ = #include "../../kernels/xaxpy.opencl" ; @@ -41,8 +41,8 @@ Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event): // The main routine template <typename T> StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero if (n == 0) { return StatusCode::kInvalidDimension; } diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc index a7052af8..75219b63 100644 --- a/src/routines/level2/xgemv.cc +++ b/src/routines/level2/xgemv.cc @@ -29,8 +29,8 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xgemv<T>::Xgemv(CommandQueue &queue, Event &event): - Routine(queue, event, "GEMV", {"Xgemv"}, precision_) { +Xgemv<T>::Xgemv(Queue &queue, Event &event): + Routine<T>(queue, event, "GEMV", {"Xgemv"}, precision_) { source_string_ = #include "../../kernels/xgemv.opencl" ; @@ -43,10 +43,10 @@ template <typename T> StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc, const T beta, - const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) { + const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) { // Makes sure all dimensions are larger than zero if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; } diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index 85524891..525a82e6 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -29,8 +29,8 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xgemm<T>::Xgemm(CommandQueue &queue, Event &event): - Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xgemm<T>::Xgemm(Queue &queue, Event &event): + Routine<T>(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -48,10 +48,10 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; } @@ -117,9 +117,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, c_do_transpose == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T)); - auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T)); + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*m_ceiled); + auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc index bc257c44..a1c0c7c1 100644 --- a/src/routines/level3/xhemm.cc +++ b/src/routines/level3/xhemm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xhemm<T>::Xhemm(CommandQueue &queue, Event &event): +Xhemm<T>::Xhemm(Queue &queue, Event &event): Xgemm<T>(queue, event) { } @@ -32,10 +32,10 @@ template <typename T> StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } @@ -56,7 +56,7 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle // Temporary buffer for a copy of the hermitian matrix try { - auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + auto temp_herm = Buffer<T>(context_, k*k); // Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm // routine afterwards diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc index fa42733f..29b2f733 100644 --- a/src/routines/level3/xher2k.cc +++ b/src/routines/level3/xher2k.cc @@ -27,8 +27,8 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom // Constructor: forwards to base class constructor template <typename T, typename U> -Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event): - Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xher2k<T,U>::Xher2k(Queue &queue, Event &event): + Routine<T>(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -45,10 +45,10 @@ template <typename T, typename U> StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } @@ -105,11 +105,11 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co ab_rotated == false && ab_conjugate == true; // Creates the temporary matrices - auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc index ae350050..5174e9ab 100644 --- a/src/routines/level3/xherk.cc +++ b/src/routines/level3/xherk.cc @@ -27,8 +27,8 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp // Constructor: forwards to base class constructor template <typename T, typename U> -Xherk<T,U>::Xherk(CommandQueue &queue, Event &event): - Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xherk<T,U>::Xherk(Queue &queue, Event &event): + Routine<T>(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -45,9 +45,9 @@ template <typename T, typename U> StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const U alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const U beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } @@ -98,9 +98,9 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons a_rotated == false && b_conjugate == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc index 1d17f0eb..37c08d3b 100644 --- a/src/routines/level3/xsymm.cc +++ b/src/routines/level3/xsymm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xsymm<T>::Xsymm(CommandQueue &queue, Event &event): +Xsymm<T>::Xsymm(Queue &queue, Event &event): Xgemm<T>(queue, event) { } @@ -32,10 +32,10 @@ template <typename T> StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; } @@ -56,7 +56,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle // Temporary buffer for a copy of the symmetric matrix try { - auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + auto temp_symm = Buffer<T>(context_, k*k); // Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm // routine afterwards diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc index 7ab3430a..b36e7c5e 100644 --- a/src/routines/level3/xsyr2k.cc +++ b/src/routines/level3/xsyr2k.cc @@ -29,8 +29,8 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou // Constructor: forwards to base class constructor template <typename T> -Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event): - Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event): + Routine<T>(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -47,10 +47,10 @@ template <typename T> StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } @@ -99,9 +99,9 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons ab_rotated == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); // Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros to // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc index c6feb5e6..e4668216 100644 --- a/src/routines/level3/xsyrk.cc +++ b/src/routines/level3/xsyrk.cc @@ -29,8 +29,8 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub // Constructor: forwards to base class constructor template <typename T> -Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event): - Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { +Xsyrk<T>::Xsyrk(Queue &queue, Event &event): + Routine<T>(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) { source_string_ = #include "../../kernels/copy.opencl" #include "../../kernels/pad.opencl" @@ -47,9 +47,9 @@ template <typename T> StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, const T beta, - const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) { + const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) { // Makes sure all dimensions are larger than zero if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; } @@ -93,8 +93,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const a_rotated == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T)); - auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T)); + auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled); // Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc index 52f272e3..8be7d950 100644 --- a/src/routines/level3/xtrmm.cc +++ b/src/routines/level3/xtrmm.cc @@ -21,7 +21,7 @@ namespace clblast { // Constructor: forwards to base class constructor template <typename T> -Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event): +Xtrmm<T>::Xtrmm(Queue &queue, Event &event): Xgemm<T>(queue, event) { } @@ -33,8 +33,8 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, - const Buffer &a_buffer, const size_t a_offset, const size_t a_ld, - const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) { + const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) { // Makes sure all dimensions are larger than zero if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; } @@ -58,7 +58,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle // Temporary buffer for a copy of the triangular matrix try { - auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T)); + auto temp_triangular = Buffer<T>(context_, k*k); // Creates a general matrix from the triangular matrix to be able to run the regular Xgemm // routine afterwards |