summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author CNugteren <web@cedricnugteren.nl> 2015-07-27 07:18:06 +0200
committer CNugteren <web@cedricnugteren.nl> 2015-07-27 07:18:06 +0200
commit f7199b831f847340f0921ef2140a4e64809db037 (patch)
tree d725b7e63b0662598ad4be0a4c2457820ded8ed4 /src
parent b10f4a633c4ffb3bb04d35503396ff94528df4d0 (diff)
Now using the new Claduc C++11 OpenCL header
Diffstat (limited to 'src')
-rw-r--r-- src/clblast.cc 78
-rw-r--r-- src/database.cc 18
-rw-r--r-- src/routine.cc 98
-rw-r--r-- src/routines/level1/xaxpy.cc 8
-rw-r--r-- src/routines/level2/xgemv.cc 10
-rw-r--r-- src/routines/level3/xgemm.cc 16
-rw-r--r-- src/routines/level3/xhemm.cc 10
-rw-r--r-- src/routines/level3/xher2k.cc 20
-rw-r--r-- src/routines/level3/xherk.cc 14
-rw-r--r-- src/routines/level3/xsymm.cc 10
-rw-r--r-- src/routines/level3/xsyr2k.cc 16
-rw-r--r-- src/routines/level3/xsyrk.cc 12
-rw-r--r-- src/routines/level3/xtrmm.cc 8
13 files changed, 165 insertions, 153 deletions
diff --git a/src/clblast.cc b/src/clblast.cc
index 6cb4086e..eddb8022 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -43,7 +43,7 @@ StatusCode Axpy(const size_t n, const T alpha,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xaxpy<T>(queue_cpp, event_cpp);
@@ -53,8 +53,8 @@ StatusCode Axpy(const size_t n, const T alpha,
// Runs the routine
return routine.DoAxpy(n, alpha,
- Buffer(x_buffer), x_offset, x_inc,
- Buffer(y_buffer), y_offset, y_inc);
+ Buffer<T>(x_buffer), x_offset, x_inc,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Axpy<float>(const size_t, const float,
const cl_mem, const size_t, const size_t,
@@ -85,7 +85,7 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xgemv<T>(queue_cpp, event_cpp);
@@ -95,9 +95,9 @@ StatusCode Gemv(const Layout layout, const Transpose a_transpose,
// Runs the routine
return routine.DoGemv(layout, a_transpose, m, n, alpha,
- Buffer(a_buffer), a_offset, a_ld,
- Buffer(x_buffer), x_offset, x_inc, beta,
- Buffer(y_buffer), y_offset, y_inc);
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(x_buffer), x_offset, x_inc, beta,
+ Buffer<T>(y_buffer), y_offset, y_inc);
}
template StatusCode Gemv<float>(const Layout, const Transpose,
const size_t, const size_t, const float,
@@ -135,7 +135,7 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xgemm<T>(queue_cpp, event_cpp);
@@ -145,9 +145,9 @@ StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpos
// Runs the routine
return routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha,
- Buffer(a_buffer), a_offset, a_ld,
- Buffer(b_buffer), b_offset, b_ld, beta,
- Buffer(c_buffer), c_offset, c_ld);
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld, beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
}
template StatusCode Gemm<float>(const Layout, const Transpose, const Transpose,
const size_t, const size_t, const size_t, const float,
@@ -184,7 +184,7 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsymm<T>(queue_cpp, event_cpp);
@@ -194,9 +194,9 @@ StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
// Runs the routine
return routine.DoSymm(layout, side, triangle, m, n, alpha,
- Buffer(a_buffer), a_offset, a_ld,
- Buffer(b_buffer), b_offset, b_ld, beta,
- Buffer(c_buffer), c_offset, c_ld);
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld, beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
}
template StatusCode Symm<float>(const Layout, const Side, const Triangle,
const size_t, const size_t, const float,
@@ -233,7 +233,7 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xhemm<T>(queue_cpp, event_cpp);
@@ -243,9 +243,9 @@ StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
// Runs the routine
return routine.DoHemm(layout, side, triangle, m, n, alpha,
- Buffer(a_buffer), a_offset, a_ld,
- Buffer(b_buffer), b_offset, b_ld, beta,
- Buffer(c_buffer), c_offset, c_ld);
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld, beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
}
template StatusCode Hemm<float2>(const Layout, const Side, const Triangle,
const size_t, const size_t, const float2,
@@ -269,7 +269,7 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsyrk<T>(queue_cpp, event_cpp);
@@ -279,8 +279,8 @@ StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_
// Runs the routine
return routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha,
- Buffer(a_buffer), a_offset, a_ld, beta,
- Buffer(c_buffer), c_offset, c_ld);
+ Buffer<T>(a_buffer), a_offset, a_ld, beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
}
template StatusCode Syrk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
@@ -312,7 +312,7 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xherk<std::complex<T>,T>(queue_cpp, event_cpp);
@@ -322,8 +322,8 @@ StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_
// Runs the routine
return routine.DoHerk(layout, triangle, a_transpose, n, k, alpha,
- Buffer(a_buffer), a_offset, a_ld, beta,
- Buffer(c_buffer), c_offset, c_ld);
+ Buffer<std::complex<T>>(a_buffer), a_offset, a_ld, beta,
+ Buffer<std::complex<T>>(c_buffer), c_offset, c_ld);
}
template StatusCode Herk<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
@@ -346,7 +346,7 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xsyr2k<T>(queue_cpp, event_cpp);
@@ -356,9 +356,9 @@ StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose a
// Runs the routine
return routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha,
- Buffer(a_buffer), a_offset, a_ld,
- Buffer(b_buffer), b_offset, b_ld, beta,
- Buffer(c_buffer), c_offset, c_ld);
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld, beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
}
template StatusCode Syr2k<float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float,
@@ -395,7 +395,7 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta,
cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xher2k<T,U>(queue_cpp, event_cpp);
@@ -405,9 +405,9 @@ StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose a
// Runs the routine
return routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha,
- Buffer(a_buffer), a_offset, a_ld,
- Buffer(b_buffer), b_offset, b_ld, beta,
- Buffer(c_buffer), c_offset, c_ld);
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld, beta,
+ Buffer<T>(c_buffer), c_offset, c_ld);
}
template StatusCode Her2k<float2,float>(const Layout, const Triangle, const Transpose,
const size_t, const size_t, const float2,
@@ -433,7 +433,7 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xtrmm<T>(queue_cpp, event_cpp);
@@ -443,8 +443,8 @@ StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle,
// Runs the routine
return routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
- Buffer(a_buffer), a_offset, a_ld,
- Buffer(b_buffer), b_offset, b_ld);
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld);
}
template StatusCode Trmm<float>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
@@ -483,7 +483,7 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
cl_command_queue* queue, cl_event* event) {
- auto queue_cpp = CommandQueue(*queue);
+ auto queue_cpp = Queue(*queue);
auto event_cpp = Event(*event);
auto routine = Xtrsm<T>(queue_cpp, event_cpp);
@@ -493,8 +493,8 @@ StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle,
// Runs the routine
return routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha,
- Buffer(a_buffer), a_offset, a_ld,
- Buffer(b_buffer), b_offset, b_ld);
+ Buffer<T>(a_buffer), a_offset, a_ld,
+ Buffer<T>(b_buffer), b_offset, b_ld);
}
template StatusCode Trsm<float>(const Layout, const Side, const Triangle,
const Transpose, const Diagonal,
diff --git a/src/database.cc b/src/database.cc
index 4d9d844e..258d861e 100644
--- a/src/database.cc
+++ b/src/database.cc
@@ -39,7 +39,7 @@ const std::vector<Database::DatabaseEntry> Database::database = {
// =================================================================================================
// Constructor, computing device properties and populating the parameter-vector from the database
-Database::Database(const CommandQueue &queue, const std::vector<std::string> &kernels,
+Database::Database(const Queue &queue, const std::vector<std::string> &kernels,
const Precision precision):
parameters_{} {
@@ -71,7 +71,7 @@ std::string Database::GetDefines() const {
// Searches the database for the right kernel and precision
Database::Parameters Database::Search(const std::string &this_kernel,
- const cl_device_type this_type,
+ const std::string &this_type,
const std::string &this_vendor,
const std::string &this_device,
const Precision this_precision) const {
@@ -81,13 +81,13 @@ Database::Parameters Database::Search(const std::string &this_kernel,
// Searches for the right vendor and device type, or selects the default if unavailable. This
// assumes that the default vendor / device type is last in the database.
for (auto &vendor: db.vendors) {
- if (VendorEqual(vendor.name, this_vendor) &&
- (vendor.type == this_type || vendor.type == CL_DEVICE_TYPE_ALL)) {
+ if ((vendor.name == this_vendor || vendor.name == kDeviceVendorAll) &&
+ (vendor.type == this_type || vendor.type == kDeviceTypeAll)) {
// Searches for the right device. If the current device is unavailable, selects the vendor
// default parameters. This assumes the default is last in the database.
for (auto &device: vendor.devices) {
- if (device.name == this_device || device.name == kDefault) {
+ if (device.name == this_device || device.name == kDefaultDevice) {
// Sets the parameters accordingly
return device.parameters;
@@ -102,13 +102,5 @@ Database::Parameters Database::Search(const std::string &this_kernel,
throw std::runtime_error("Database error, could not find a suitable entry");
}
-// Determines the equality between two vendor names. This is implemented because vendor names can
-// be ambigious and might change between different SDK or driver versions.
-bool Database::VendorEqual(const std::string &db_vendor, const std::string &cl_vendor) const {
- if (db_vendor == kDefault) { return true; }
- if (db_vendor == cl_vendor) { return true; }
- return false;
-}
-
// =================================================================================================
} // namespace clblast
diff --git a/src/routine.cc b/src/routine.cc
index aded1a31..31476c42 100644
--- a/src/routine.cc
+++ b/src/routine.cc
@@ -13,17 +13,17 @@
#include "internal/routine.h"
-#include "internal/utilities.h"
-
namespace clblast {
// =================================================================================================
// The cache of compiled OpenCL programs
-std::vector<Routine::ProgramCache> Routine::program_cache_;
+template <typename T>
+std::vector<typename Routine<T>::ProgramCache> Routine<T>::program_cache_;
// Constructor: not much here, because no status codes can be returned
-Routine::Routine(CommandQueue &queue, Event &event, const std::string &name,
- const std::vector<std::string> &routines, const Precision precision):
+template <typename T>
+Routine<T>::Routine(Queue &queue, Event &event, const std::string &name,
+ const std::vector<std::string> &routines, const Precision precision):
precision_(precision),
routine_name_(name),
queue_(queue),
@@ -40,14 +40,15 @@ Routine::Routine(CommandQueue &queue, Event &event, const std::string &name,
// =================================================================================================
// Separate set-up function to allow for status codes to be returned
-StatusCode Routine::SetUp() {
+template <typename T>
+StatusCode Routine<T>::SetUp() {
// Queries the cache to see whether or not the compiled kernel is already there. If not, it will
// be built and added to the cache.
if (!ProgramIsInCache()) {
// Inspects whether or not cl_khr_fp64 is supported in case of double precision
- auto extensions = device_.Extensions();
+ auto extensions = device_.Capabilities();
if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
return StatusCode::kNoDoublePrecision;
@@ -85,16 +86,16 @@ StatusCode Routine::SetUp() {
// Compiles the kernel
try {
auto program = Program(context_, source_string);
- auto options = std::string{};
- auto status = program.Build(device_, options);
+ auto options = std::vector<std::string>();
+ auto build_status = program.Build(device_, options);
// Checks for compiler crashes/errors/warnings
- if (status == CL_BUILD_PROGRAM_FAILURE) {
+ if (build_status == BuildStatus::kError) {
auto message = program.GetBuildInfo(device_);
fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
return StatusCode::kBuildProgramFailure;
}
- if (status == CL_INVALID_BINARY) { return StatusCode::kInvalidBinary; }
+ if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
// Store the compiled program in the cache
program_cache_.push_back({program, device_name_, precision_, routine_name_});
@@ -108,8 +109,9 @@ StatusCode Routine::SetUp() {
// =================================================================================================
// Enqueues a kernel, waits for completion, and checks for errors
-StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global,
- const std::vector<size_t> &local) {
+template <typename T>
+StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
+ const std::vector<size_t> &local) {
// Tests for validity of the local thread sizes
if (local.size() > max_work_item_dimensions_) {
@@ -132,12 +134,14 @@ StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global,
if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
// Launches the kernel (and checks for launch errors)
- auto status = queue_.EnqueueKernel(kernel, global, local, event_);
- if (status != CL_SUCCESS) { return StatusCode::kKernelLaunchError; }
+ try {
+ kernel.Launch(queue_, global, local, event_);
+ } catch (...) { return StatusCode::kKernelLaunchError; }
// Waits for completion of the kernel
- status = event_.Wait();
- if (status != CL_SUCCESS) { return StatusCode::kKernelRunError; }
+ try {
+ queue_.Finish(event_);
+ } catch (...) { return StatusCode::kKernelRunError; }
// No errors, normal termination of this function
return StatusCode::kSuccess;
@@ -147,8 +151,9 @@ StatusCode Routine::RunKernel(const Kernel &kernel, std::vector<size_t> &global,
// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
// sufficient buffer size.
-StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer &buffer,
- const size_t offset, const size_t ld, const size_t data_size) {
+template <typename T>
+StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
+ const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimA; }
try {
auto required_size = (ld*two + offset)*data_size;
@@ -160,8 +165,9 @@ StatusCode Routine::TestMatrixA(const size_t one, const size_t two, const Buffer
// Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
// sufficient buffer size.
-StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer &buffer,
- const size_t offset, const size_t ld, const size_t data_size) {
+template <typename T>
+StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
+ const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimB; }
try {
auto required_size = (ld*two + offset)*data_size;
@@ -173,8 +179,9 @@ StatusCode Routine::TestMatrixB(const size_t one, const size_t two, const Buffer
// Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
// sufficient buffer size.
-StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer &buffer,
- const size_t offset, const size_t ld, const size_t data_size) {
+template <typename T>
+StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
+ const size_t offset, const size_t ld, const size_t data_size) {
if (ld < one) { return StatusCode::kInvalidLeadDimC; }
try {
auto required_size = (ld*two + offset)*data_size;
@@ -188,8 +195,9 @@ StatusCode Routine::TestMatrixC(const size_t one, const size_t two, const Buffer
// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
// sufficient buffer size.
-StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size_t offset,
- const size_t inc, const size_t data_size) {
+template <typename T>
+StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
+ const size_t inc, const size_t data_size) {
if (inc == 0) { return StatusCode::kInvalidIncrementX; }
try {
auto required_size = (n*inc + offset)*data_size;
@@ -201,8 +209,9 @@ StatusCode Routine::TestVectorX(const size_t n, const Buffer &buffer, const size
// Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a
// sufficient buffer size.
-StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size_t offset,
- const size_t inc, const size_t data_size) {
+template <typename T>
+StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
+ const size_t inc, const size_t data_size) {
if (inc == 0) { return StatusCode::kInvalidIncrementY; }
try {
auto required_size = (n*inc + offset)*data_size;
@@ -215,16 +224,17 @@ StatusCode Routine::TestVectorY(const size_t n, const Buffer &buffer, const size
// =================================================================================================
// Copies or transposes a matrix and pads/unpads it with zeros
-StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
- const size_t src_ld, const size_t src_offset,
- const Buffer &src,
- const size_t dest_one, const size_t dest_two,
- const size_t dest_ld, const size_t dest_offset,
- const Buffer &dest,
- const Program &program, const bool do_pad,
- const bool do_transpose, const bool do_conjugate,
- const bool upper, const bool lower,
- const bool diagonal_imag_zero) {
+template <typename T>
+StatusCode Routine<T>::PadCopyTransposeMatrix(const size_t src_one, const size_t src_two,
+ const size_t src_ld, const size_t src_offset,
+ const Buffer<T> &src,
+ const size_t dest_one, const size_t dest_two,
+ const size_t dest_ld, const size_t dest_offset,
+ const Buffer<T> &dest,
+ const Program &program, const bool do_pad,
+ const bool do_transpose, const bool do_conjugate,
+ const bool upper, const bool lower,
+ const bool diagonal_imag_zero) {
// Determines whether or not the fast-version could potentially be used
auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
@@ -328,7 +338,8 @@ StatusCode Routine::PadCopyTransposeMatrix(const size_t src_one, const size_t sr
// Queries the cache and retrieves a matching program. Assumes that the match is available, throws
// otherwise.
-const Program& Routine::GetProgramFromCache() const {
+template <typename T>
+const Program& Routine<T>::GetProgramFromCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) {
return cached_program.program;
@@ -338,7 +349,8 @@ const Program& Routine::GetProgramFromCache() const {
}
// Queries the cache to see whether or not the compiled kernel is already there
-bool Routine::ProgramIsInCache() const {
+template <typename T>
+bool Routine<T>::ProgramIsInCache() const {
for (auto &cached_program: program_cache_) {
if (cached_program.MatchInCache(device_name_, precision_, routine_name_)) { return true; }
}
@@ -346,4 +358,12 @@ bool Routine::ProgramIsInCache() const {
}
// =================================================================================================
+
+// Compiles the templated class
+template class Routine<float>;
+template class Routine<double>;
+template class Routine<float2>;
+template class Routine<double2>;
+
+// =================================================================================================
} // namespace clblast
diff --git a/src/routines/level1/xaxpy.cc b/src/routines/level1/xaxpy.cc
index e6b320d9..7646b0e4 100644
--- a/src/routines/level1/xaxpy.cc
+++ b/src/routines/level1/xaxpy.cc
@@ -29,8 +29,8 @@ template <> const Precision Xaxpy<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
- Routine(queue, event, "AXPY", {"Xaxpy"}, precision_) {
+Xaxpy<T>::Xaxpy(Queue &queue, Event &event):
+ Routine<T>(queue, event, "AXPY", {"Xaxpy"}, precision_) {
source_string_ =
#include "../../kernels/xaxpy.opencl"
;
@@ -41,8 +41,8 @@ Xaxpy<T>::Xaxpy(CommandQueue &queue, Event &event):
// The main routine
template <typename T>
StatusCode Xaxpy<T>::DoAxpy(const size_t n, const T alpha,
- const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
- const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (n == 0) { return StatusCode::kInvalidDimension; }
diff --git a/src/routines/level2/xgemv.cc b/src/routines/level2/xgemv.cc
index a7052af8..75219b63 100644
--- a/src/routines/level2/xgemv.cc
+++ b/src/routines/level2/xgemv.cc
@@ -29,8 +29,8 @@ template <> const Precision Xgemv<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xgemv<T>::Xgemv(CommandQueue &queue, Event &event):
- Routine(queue, event, "GEMV", {"Xgemv"}, precision_) {
+Xgemv<T>::Xgemv(Queue &queue, Event &event):
+ Routine<T>(queue, event, "GEMV", {"Xgemv"}, precision_) {
source_string_ =
#include "../../kernels/xgemv.opencl"
;
@@ -43,10 +43,10 @@ template <typename T>
StatusCode Xgemv<T>::DoGemv(const Layout layout, const Transpose a_transpose,
const size_t m, const size_t n,
const T alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer &x_buffer, const size_t x_offset, const size_t x_inc,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
- const Buffer &y_buffer, const size_t y_offset, const size_t y_inc) {
+ const Buffer<T> &y_buffer, const size_t y_offset, const size_t y_inc) {
// Makes sure all dimensions are larger than zero
if (m == 0 || n == 0) { return StatusCode::kInvalidDimension; }
diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc
index 85524891..525a82e6 100644
--- a/src/routines/level3/xgemm.cc
+++ b/src/routines/level3/xgemm.cc
@@ -29,8 +29,8 @@ template <> const Precision Xgemm<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xgemm<T>::Xgemm(CommandQueue &queue, Event &event):
- Routine(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+Xgemm<T>::Xgemm(Queue &queue, Event &event):
+ Routine<T>(queue, event, "GEMM", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
@@ -48,10 +48,10 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
const Transpose a_transpose, const Transpose b_transpose,
const size_t m, const size_t n, const size_t k,
const T alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
- const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) || (k == 0)) { return StatusCode::kInvalidDimension; }
@@ -117,9 +117,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout,
c_do_transpose == false;
// Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*m_ceiled*sizeof(T));
- auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto c_temp = (c_no_temp) ? c_buffer : Buffer(context_, CL_MEM_READ_WRITE, m_ceiled*n_ceiled*sizeof(T));
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*m_ceiled);
+ auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled);
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
diff --git a/src/routines/level3/xhemm.cc b/src/routines/level3/xhemm.cc
index bc257c44..a1c0c7c1 100644
--- a/src/routines/level3/xhemm.cc
+++ b/src/routines/level3/xhemm.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xhemm<T>::Xhemm(CommandQueue &queue, Event &event):
+Xhemm<T>::Xhemm(Queue &queue, Event &event):
Xgemm<T>(queue, event) {
}
@@ -32,10 +32,10 @@ template <typename T>
StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
- const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
@@ -56,7 +56,7 @@ StatusCode Xhemm<T>::DoHemm(const Layout layout, const Side side, const Triangle
// Temporary buffer for a copy of the hermitian matrix
try {
- auto temp_herm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+ auto temp_herm = Buffer<T>(context_, k*k);
// Creates a general matrix from the hermitian matrix to be able to run the regular Xgemm
// routine afterwards
diff --git a/src/routines/level3/xher2k.cc b/src/routines/level3/xher2k.cc
index fa42733f..29b2f733 100644
--- a/src/routines/level3/xher2k.cc
+++ b/src/routines/level3/xher2k.cc
@@ -27,8 +27,8 @@ template <> const Precision Xher2k<double2,double>::precision_ = Precision::kCom
// Constructor: forwards to base class constructor
template <typename T, typename U>
-Xher2k<T,U>::Xher2k(CommandQueue &queue, Event &event):
- Routine(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+Xher2k<T,U>::Xher2k(Queue &queue, Event &event):
+ Routine<T>(queue, event, "HER2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
@@ -45,10 +45,10 @@ template <typename T, typename U>
StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const U beta,
- const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
@@ -105,11 +105,11 @@ StatusCode Xher2k<T,U>::DoHer2k(const Layout layout, const Triangle triangle, co
ab_rotated == false && ab_conjugate == true;
// Creates the temporary matrices
- auto a1_temp = (a1_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto a2_temp = (a2_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto b1_temp = (b1_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto b2_temp = (b2_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+ auto a1_temp = (a1_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto a2_temp = (a2_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b1_temp = (b1_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b2_temp = (b2_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
diff --git a/src/routines/level3/xherk.cc b/src/routines/level3/xherk.cc
index ae350050..5174e9ab 100644
--- a/src/routines/level3/xherk.cc
+++ b/src/routines/level3/xherk.cc
@@ -27,8 +27,8 @@ template <> const Precision Xherk<double2,double>::precision_ = Precision::kComp
// Constructor: forwards to base class constructor
template <typename T, typename U>
-Xherk<T,U>::Xherk(CommandQueue &queue, Event &event):
- Routine(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+Xherk<T,U>::Xherk(Queue &queue, Event &event):
+ Routine<T>(queue, event, "HERK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
@@ -45,9 +45,9 @@ template <typename T, typename U>
StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const U alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const U beta,
- const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
@@ -98,9 +98,9 @@ StatusCode Xherk<T,U>::DoHerk(const Layout layout, const Triangle triangle, cons
a_rotated == false && b_conjugate == false;
// Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto b_temp = (b_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b_temp = (b_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
diff --git a/src/routines/level3/xsymm.cc b/src/routines/level3/xsymm.cc
index 1d17f0eb..37c08d3b 100644
--- a/src/routines/level3/xsymm.cc
+++ b/src/routines/level3/xsymm.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xsymm<T>::Xsymm(CommandQueue &queue, Event &event):
+Xsymm<T>::Xsymm(Queue &queue, Event &event):
Xgemm<T>(queue, event) {
}
@@ -32,10 +32,10 @@ template <typename T>
StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle triangle,
const size_t m, const size_t n,
const T alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
- const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0) ) { return StatusCode::kInvalidDimension; }
@@ -56,7 +56,7 @@ StatusCode Xsymm<T>::DoSymm(const Layout layout, const Side side, const Triangle
// Temporary buffer for a copy of the symmetric matrix
try {
- auto temp_symm = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+ auto temp_symm = Buffer<T>(context_, k*k);
// Creates a general matrix from the symmetric matrix to be able to run the regular Xgemm
// routine afterwards
diff --git a/src/routines/level3/xsyr2k.cc b/src/routines/level3/xsyr2k.cc
index 7ab3430a..b36e7c5e 100644
--- a/src/routines/level3/xsyr2k.cc
+++ b/src/routines/level3/xsyr2k.cc
@@ -29,8 +29,8 @@ template <> const Precision Xsyr2k<double2>::precision_ = Precision::kComplexDou
// Constructor: forwards to base class constructor
template <typename T>
-Xsyr2k<T>::Xsyr2k(CommandQueue &queue, Event &event):
- Routine(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+Xsyr2k<T>::Xsyr2k(Queue &queue, Event &event):
+ Routine<T>(queue, event, "SYR2K", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
@@ -47,10 +47,10 @@ template <typename T>
StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
const size_t n, const size_t k,
const T alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer &b_buffer, const size_t b_offset, const size_t b_ld,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld,
const T beta,
- const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
@@ -99,9 +99,9 @@ StatusCode Xsyr2k<T>::DoSyr2k(const Layout layout, const Triangle triangle, cons
ab_rotated == false;
// Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto b_temp = (b_no_temp) ? b_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Runs the pre-processing kernels. This transposes the matrices A and B, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
diff --git a/src/routines/level3/xsyrk.cc b/src/routines/level3/xsyrk.cc
index c6feb5e6..e4668216 100644
--- a/src/routines/level3/xsyrk.cc
+++ b/src/routines/level3/xsyrk.cc
@@ -29,8 +29,8 @@ template <> const Precision Xsyrk<double2>::precision_ = Precision::kComplexDoub
// Constructor: forwards to base class constructor
template <typename T>
-Xsyrk<T>::Xsyrk(CommandQueue &queue, Event &event):
- Routine(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
+Xsyrk<T>::Xsyrk(Queue &queue, Event &event):
+ Routine<T>(queue, event, "SYRK", {"Copy","Pad","Transpose","PadTranspose","Xgemm"}, precision_) {
source_string_ =
#include "../../kernels/copy.opencl"
#include "../../kernels/pad.opencl"
@@ -47,9 +47,9 @@ template <typename T>
StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
const size_t n, const size_t k,
const T alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
const T beta,
- const Buffer &c_buffer, const size_t c_offset, const size_t c_ld) {
+ const Buffer<T> &c_buffer, const size_t c_offset, const size_t c_ld) {
// Makes sure all dimensions are larger than zero
if ((n == 0) || (k == 0) ) { return StatusCode::kInvalidDimension; }
@@ -93,8 +93,8 @@ StatusCode Xsyrk<T>::DoSyrk(const Layout layout, const Triangle triangle, const
a_rotated == false;
// Creates the temporary matrices
- auto a_temp = (a_no_temp) ? a_buffer : Buffer(context_, CL_MEM_READ_WRITE, k_ceiled*n_ceiled*sizeof(T));
- auto c_temp = Buffer(context_, CL_MEM_READ_WRITE, n_ceiled*n_ceiled*sizeof(T));
+ auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*n_ceiled);
+ auto c_temp = Buffer<T>(context_, n_ceiled*n_ceiled);
// Runs the pre-processing kernel for matrix A. This transposes the matrix, but also pads zeros
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
diff --git a/src/routines/level3/xtrmm.cc b/src/routines/level3/xtrmm.cc
index 52f272e3..8be7d950 100644
--- a/src/routines/level3/xtrmm.cc
+++ b/src/routines/level3/xtrmm.cc
@@ -21,7 +21,7 @@ namespace clblast {
// Constructor: forwards to base class constructor
template <typename T>
-Xtrmm<T>::Xtrmm(CommandQueue &queue, Event &event):
+Xtrmm<T>::Xtrmm(Queue &queue, Event &event):
Xgemm<T>(queue, event) {
}
@@ -33,8 +33,8 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
const Transpose a_transpose, const Diagonal diagonal,
const size_t m, const size_t n,
const T alpha,
- const Buffer &a_buffer, const size_t a_offset, const size_t a_ld,
- const Buffer &b_buffer, const size_t b_offset, const size_t b_ld) {
+ const Buffer<T> &a_buffer, const size_t a_offset, const size_t a_ld,
+ const Buffer<T> &b_buffer, const size_t b_offset, const size_t b_ld) {
// Makes sure all dimensions are larger than zero
if ((m == 0) || (n == 0)) { return StatusCode::kInvalidDimension; }
@@ -58,7 +58,7 @@ StatusCode Xtrmm<T>::DoTrmm(const Layout layout, const Side side, const Triangle
// Temporary buffer for a copy of the triangular matrix
try {
- auto temp_triangular = Buffer(context_, CL_MEM_READ_WRITE, k*k*sizeof(T));
+ auto temp_triangular = Buffer<T>(context_, k*k);
// Creates a general matrix from the triangular matrix to be able to run the regular Xgemm
// routine afterwards