From 03182f9d07533f795a498936391da744d982e8e2 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Thu, 26 May 2016 23:36:19 +0200 Subject: Added half-precision tests for the clBLAS reference through conversion to single-precision --- include/internal/utilities.h | 10 + scripts/generator/generator.py | 29 +- scripts/generator/routine.py | 20 +- src/utilities.cc | 35 + test/correctness/testblas.cc | 1 + test/routines/level1/xamax.h | 4 +- test/routines/level1/xasum.h | 4 +- test/routines/level1/xaxpy.h | 4 +- test/routines/level1/xcopy.h | 4 +- test/routines/level1/xdot.h | 6 +- test/routines/level1/xdotc.h | 6 +- test/routines/level1/xdotu.h | 6 +- test/routines/level1/xnrm2.h | 4 +- test/routines/level1/xscal.h | 2 +- test/routines/level1/xswap.h | 4 +- test/routines/level2/xgbmv.h | 6 +- test/routines/level2/xgemv.h | 6 +- test/routines/level2/xger.h | 6 +- test/routines/level2/xgerc.h | 6 +- test/routines/level2/xgeru.h | 6 +- test/routines/level2/xhbmv.h | 6 +- test/routines/level2/xhemv.h | 6 +- test/routines/level2/xher.h | 4 +- test/routines/level2/xher2.h | 6 +- test/routines/level2/xhpmv.h | 6 +- test/routines/level2/xhpr.h | 4 +- test/routines/level2/xhpr2.h | 6 +- test/routines/level2/xsbmv.h | 6 +- test/routines/level2/xspmv.h | 6 +- test/routines/level2/xspr.h | 4 +- test/routines/level2/xspr2.h | 6 +- test/routines/level2/xsymv.h | 6 +- test/routines/level2/xsyr.h | 4 +- test/routines/level2/xsyr2.h | 6 +- test/routines/level2/xtbmv.h | 4 +- test/routines/level2/xtpmv.h | 4 +- test/routines/level2/xtrmv.h | 4 +- test/routines/level3/xgemm.h | 6 +- test/routines/level3/xhemm.h | 6 +- test/routines/level3/xher2k.h | 6 +- test/routines/level3/xherk.h | 4 +- test/routines/level3/xsymm.h | 6 +- test/routines/level3/xsyr2k.h | 6 +- test/routines/level3/xsyrk.h | 4 +- test/routines/level3/xtrmm.h | 4 +- test/wrapper_cblas.h | 10 - test/wrapper_clblas.h | 1928 +++++++++++++++++++++++----------------- 47 files changed, 1281 insertions(+), 956 deletions(-) diff --git a/include/internal/utilities.h b/include/internal/utilities.h index be7a77b4..d3c8ebdb 100644 --- a/include/internal/utilities.h +++ b/include/internal/utilities.h @@ -213,6 +213,16 @@ void PopulateVector(std::vector &vector); // ================================================================================================= +// Conversion between half and single-precision +std::vector HalfToFloatBuffer(const std::vector& source); +void FloatToHalfBuffer(std::vector& result, const std::vector& source); + +// As above, but now for OpenCL data-types instead of std::vectors +Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw); +void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw); + +// ================================================================================================= + // Rounding functions size_t CeilDiv(const size_t x, const size_t y); size_t Ceil(const size_t x, const size_t y); diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 3d07c5a3..f5fc5ecf 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -235,9 +235,11 @@ def wrapper_clblas(routines): if routine.NoScalars(): result += routine.RoutineHeaderWrapperCL(routine.template, True, 21)+";\n" for flavour in routine.flavours: - indent = " "*(17 + routine.Length()) result += routine.RoutineHeaderWrapperCL(flavour, False, 21)+" {\n" + + # There is a version available in clBLAS if flavour.precision_name in ["S","D","C","Z"]: + indent = " "*(17 + routine.Length()) arguments = routine.ArgumentsWrapperCL(flavour) if routine.scratch: result += " auto queue = Queue(queues[0]);\n" @@ -247,8 +249,27 @@ def wrapper_clblas(routines): result += " return clblas"+flavour.name+routine.name+"(" result += (",\n"+indent).join([a for a in arguments]) result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" - else: - result += " return clblasNotImplemented;" + + # There is no clBLAS available, forward the call to one of the available functions + else: # Half-precision + indent = " "*(24 + routine.Length()) + + # Convert to float (note: also integer buffers are stored as half/float) + for buf in routine.inputs + routine.outputs: + result += " auto "+buf+"_buffer_bis = HalfToFloatBuffer("+buf+"_buffer, queues[0]);\n" + + # Call the float routine + result += " auto status = clblasX"+routine.name+"(" + result += (",\n"+indent).join([a for a in routine.ArgumentsHalf()]) + result += ",\n"+indent+"num_queues, queues, num_wait_events, wait_events, events);" + result += "\n" + + # Convert back to half + for buf in routine.outputs: + result += " FloatToHalfBuffer("+buf+"_buffer, "+buf+"_buffer_bis, queues[0]);\n" + result += " return status;" + + # Complete result += "\n}\n" return result @@ -336,7 +357,7 @@ files = [ path_clblast+"/test/wrapper_clblas.h", path_clblast+"/test/wrapper_cblas.h", ] -header_lines = [84, 71, 93, 22, 29, 51] +header_lines = [84, 71, 93, 22, 29, 41] footer_lines = [17, 71, 19, 14, 6, 6] # Checks whether the command-line arguments are valid; exists otherwise diff --git a/scripts/generator/routine.py b/scripts/generator/routine.py index a347de0e..fe857ea8 100644 --- a/scripts/generator/routine.py +++ b/scripts/generator/routine.py @@ -185,6 +185,16 @@ class Routine(): return [", ".join(a+b+c)] return [] + # As above but with data-types + def BufferDefWrapperCL(self, name, flavour): + prefix = "const " if (name in self.inputs) else "" + if (name in self.inputs) or (name in self.outputs): + a = [prefix+"Buffer<"+flavour.buffertype+">& "+name+"_buffer"] + b = ["const size_t "+name+"_offset"] + c = ["const size_t "+name+"_"+self.Postfix(name)] if (name not in self.BuffersWithoutLdInc()) else [] + return [", ".join(a+b+c)] + return [] + # As above but as vectors def BufferDefVector(self, name, flavour): prefix = "const " if (name in self.inputs) else "" @@ -208,7 +218,7 @@ class Routine(): # As above but with a static cast for clBLAS wrapper def BufferWrapperCL(self, name): if (name in self.inputs) or (name in self.outputs): - a = [name+"_buffer"] + a = [name+"_buffer()"] b = [name+"_offset"] c = [] if (name in ["x","y"]): @@ -491,12 +501,12 @@ class Routine(): # As above, but clBLAS wrapper plain datatypes def ArgumentsDefWrapperCL(self, flavour): return (self.OptionsDefWrapperCL() + self.SizesDef() + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersFirst()])) + + list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersFirst()])) + self.ScalarDefPlain("alpha", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersFirst()])) + + list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersFirst()])) + self.ScalarDefPlain("beta", flavour) + - list(chain(*[self.BufferDef(b) for b in self.BuffersSecond()])) + - list(chain(*[self.BufferDef(b) for b in self.ScalarBuffersSecond()])) + + list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.BuffersSecond()])) + + list(chain(*[self.BufferDefWrapperCL(b, flavour) for b in self.ScalarBuffersSecond()])) + list(chain(*[self.ScalarDefPlain(s, flavour) for s in self.OtherScalars()]))) # As above, but CBLAS wrapper plain datatypes diff --git a/src/utilities.cc b/src/utilities.cc index 1a7c8e45..7649b05c 100644 --- a/src/utilities.cc +++ b/src/utilities.cc @@ -299,6 +299,41 @@ void PopulateVector(std::vector &vector) { // ================================================================================================= +// Conversion between half and single-precision +std::vector HalfToFloatBuffer(const std::vector& source) { + auto result = std::vector(source.size()); + for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); } + return result; +} +void FloatToHalfBuffer(std::vector& result, const std::vector& source) { + for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); } +} + +// As above, but now for OpenCL data-types instead of std::vectors +Buffer HalfToFloatBuffer(const Buffer& source, cl_command_queue queue_raw) { + const auto size = source.GetSize() / sizeof(half); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector(size); + source.Read(queue, size, source_cpu); + auto result_cpu = HalfToFloatBuffer(source_cpu); + auto result = Buffer(context, size); + result.Write(queue, size, result_cpu); + return result; +} +void FloatToHalfBuffer(Buffer& result, const Buffer& source, cl_command_queue queue_raw) { + const auto size = source.GetSize() / sizeof(float); + auto queue = Queue(queue_raw); + auto context = queue.GetContext(); + auto source_cpu = std::vector(size); + source.Read(queue, size, source_cpu); + auto result_cpu = std::vector(size); + FloatToHalfBuffer(result_cpu, source_cpu); + result.Write(queue, size, result_cpu); +} + +// ================================================================================================= + // Rounding functions performing ceiling and division operations size_t CeilDiv(const size_t x, const size_t y) { return 1 + ((x - 1) / y); diff --git a/test/correctness/testblas.cc b/test/correctness/testblas.cc index cbf8b0a0..50871402 100644 --- a/test/correctness/testblas.cc +++ b/test/correctness/testblas.cc @@ -170,6 +170,7 @@ template void TestBlas::TestInvalid(std::vector> &test_vector, const std::string &name) { if (!PrecisionSupported(device_)) { return; } if (!compare_clblas_) { return; } + if (std::is_same::value) { return; } TestStart("invalid buffer sizes", name); // Iterates over all the to-be-tested combinations of arguments diff --git a/test/routines/level1/xamax.h b/test/routines/level1/xamax.h index 7b404dc3..12b031bc 100644 --- a/test/routines/level1/xamax.h +++ b/test/routines/level1/xamax.h @@ -86,8 +86,8 @@ class TestXamax { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXamax(args.n, - buffers.scalar(), args.imax_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.imax_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xasum.h b/test/routines/level1/xasum.h index 6eae3c83..eb83817b 100644 --- a/test/routines/level1/xasum.h +++ b/test/routines/level1/xasum.h @@ -86,8 +86,8 @@ class TestXasum { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXasum(args.n, - buffers.scalar(), args.asum_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.asum_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xaxpy.h b/test/routines/level1/xaxpy.h index 8f72f570..c241da91 100644 --- a/test/routines/level1/xaxpy.h +++ b/test/routines/level1/xaxpy.h @@ -87,8 +87,8 @@ class TestXaxpy { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXaxpy(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xcopy.h b/test/routines/level1/xcopy.h index 0527ca6a..a1ff06ce 100644 --- a/test/routines/level1/xcopy.h +++ b/test/routines/level1/xcopy.h @@ -86,8 +86,8 @@ class TestXcopy { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXcopy(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xdot.h b/test/routines/level1/xdot.h index d1c34c0f..0bbc93d5 100644 --- a/test/routines/level1/xdot.h +++ b/test/routines/level1/xdot.h @@ -91,9 +91,9 @@ class TestXdot { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdot(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xdotc.h b/test/routines/level1/xdotc.h index a2742cb0..e1cc1854 100644 --- a/test/routines/level1/xdotc.h +++ b/test/routines/level1/xdotc.h @@ -91,9 +91,9 @@ class TestXdotc { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotc(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xdotu.h b/test/routines/level1/xdotu.h index 06ce979e..558257cc 100644 --- a/test/routines/level1/xdotu.h +++ b/test/routines/level1/xdotu.h @@ -91,9 +91,9 @@ class TestXdotu { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXdotu(args.n, - buffers.scalar(), args.dot_offset, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.scalar, args.dot_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xnrm2.h b/test/routines/level1/xnrm2.h index d8a0de4e..19074ca2 100644 --- a/test/routines/level1/xnrm2.h +++ b/test/routines/level1/xnrm2.h @@ -86,8 +86,8 @@ class TestXnrm2 { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXnrm2(args.n, - buffers.scalar(), args.nrm2_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.scalar, args.nrm2_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xscal.h b/test/routines/level1/xscal.h index 35855dbd..84d14ac7 100644 --- a/test/routines/level1/xscal.h +++ b/test/routines/level1/xscal.h @@ -82,7 +82,7 @@ class TestXscal { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXscal(args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level1/xswap.h b/test/routines/level1/xswap.h index ae69d3be..e870b602 100644 --- a/test/routines/level1/xswap.h +++ b/test/routines/level1/xswap.h @@ -86,8 +86,8 @@ class TestXswap { auto queue_plain = queue(); auto event = cl_event{}; auto status = clblasXswap(args.n, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xgbmv.h b/test/routines/level2/xgbmv.h index c88cdf2a..c777ff73 100644 --- a/test/routines/level2/xgbmv.h +++ b/test/routines/level2/xgbmv.h @@ -102,9 +102,9 @@ class TestXgbmv { auto status = clblasXgbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.kl, args.ku, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xgemv.h b/test/routines/level2/xgemv.h index cf63d55f..f8a7e1d0 100644 --- a/test/routines/level2/xgemv.h +++ b/test/routines/level2/xgemv.h @@ -102,9 +102,9 @@ class TestXgemv { auto status = clblasXgemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.a_transpose), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xger.h b/test/routines/level2/xger.h index ae142e2e..e0d1fe49 100644 --- a/test/routines/level2/xger.h +++ b/test/routines/level2/xger.h @@ -97,9 +97,9 @@ class TestXger { auto event = cl_event{}; auto status = clblasXger(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xgerc.h b/test/routines/level2/xgerc.h index b236aef6..7449146b 100644 --- a/test/routines/level2/xgerc.h +++ b/test/routines/level2/xgerc.h @@ -97,9 +97,9 @@ class TestXgerc { auto event = cl_event{}; auto status = clblasXgerc(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xgeru.h b/test/routines/level2/xgeru.h index 3d3fa439..07837657 100644 --- a/test/routines/level2/xgeru.h +++ b/test/routines/level2/xgeru.h @@ -97,9 +97,9 @@ class TestXgeru { auto event = cl_event{}; auto status = clblasXgeru(convertToCLBLAS(args.layout), args.m, args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhbmv.h b/test/routines/level2/xhbmv.h index 4098639a..73194975 100644 --- a/test/routines/level2/xhbmv.h +++ b/test/routines/level2/xhbmv.h @@ -96,9 +96,9 @@ class TestXhbmv { auto status = clblasXhbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhemv.h b/test/routines/level2/xhemv.h index 5652872d..aabbf14a 100644 --- a/test/routines/level2/xhemv.h +++ b/test/routines/level2/xhemv.h @@ -96,9 +96,9 @@ class TestXhemv { auto status = clblasXhemv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xher.h b/test/routines/level2/xher.h index 3bbf0887..1294832c 100644 --- a/test/routines/level2/xher.h +++ b/test/routines/level2/xher.h @@ -91,8 +91,8 @@ class TestXher { auto status = clblasXher(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xher2.h b/test/routines/level2/xher2.h index dc7fbe73..5e90174d 100644 --- a/test/routines/level2/xher2.h +++ b/test/routines/level2/xher2.h @@ -96,9 +96,9 @@ class TestXher2 { auto status = clblasXher2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhpmv.h b/test/routines/level2/xhpmv.h index df5a90ee..8face6b6 100644 --- a/test/routines/level2/xhpmv.h +++ b/test/routines/level2/xhpmv.h @@ -96,9 +96,9 @@ class TestXhpmv { auto status = clblasXhpmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhpr.h b/test/routines/level2/xhpr.h index 0db11db0..63cab31f 100644 --- a/test/routines/level2/xhpr.h +++ b/test/routines/level2/xhpr.h @@ -91,8 +91,8 @@ class TestXhpr { auto status = clblasXhpr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xhpr2.h b/test/routines/level2/xhpr2.h index e1e5b4c5..64d205a0 100644 --- a/test/routines/level2/xhpr2.h +++ b/test/routines/level2/xhpr2.h @@ -96,9 +96,9 @@ class TestXhpr2 { auto status = clblasXhpr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xsbmv.h b/test/routines/level2/xsbmv.h index fce88f4c..3f1446c8 100644 --- a/test/routines/level2/xsbmv.h +++ b/test/routines/level2/xsbmv.h @@ -96,9 +96,9 @@ class TestXsbmv { auto status = clblasXsbmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.kl, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xspmv.h b/test/routines/level2/xspmv.h index 2fdba77a..2add3cdd 100644 --- a/test/routines/level2/xspmv.h +++ b/test/routines/level2/xspmv.h @@ -96,9 +96,9 @@ class TestXspmv { auto status = clblasXspmv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xspr.h b/test/routines/level2/xspr.h index dcacc5de..ad21bdf6 100644 --- a/test/routines/level2/xspr.h +++ b/test/routines/level2/xspr.h @@ -91,8 +91,8 @@ class TestXspr { auto status = clblasXspr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xspr2.h b/test/routines/level2/xspr2.h index 69fda2fb..c55e8181 100644 --- a/test/routines/level2/xspr2.h +++ b/test/routines/level2/xspr2.h @@ -96,9 +96,9 @@ class TestXspr2 { auto status = clblasXspr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.ap_mat(), args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.ap_mat, args.ap_offset, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xsymv.h b/test/routines/level2/xsymv.h index 16f94d6f..b6583a24 100644 --- a/test/routines/level2/xsymv.h +++ b/test/routines/level2/xsymv.h @@ -96,9 +96,9 @@ class TestXsymv { auto status = clblasXsymv(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, args.beta, - buffers.y_vec(), args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, args.beta, + buffers.y_vec, args.y_offset, args.y_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xsyr.h b/test/routines/level2/xsyr.h index a66dd271..f3929588 100644 --- a/test/routines/level2/xsyr.h +++ b/test/routines/level2/xsyr.h @@ -91,8 +91,8 @@ class TestXsyr { auto status = clblasXsyr(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xsyr2.h b/test/routines/level2/xsyr2.h index a36815e5..8cdb6a14 100644 --- a/test/routines/level2/xsyr2.h +++ b/test/routines/level2/xsyr2.h @@ -96,9 +96,9 @@ class TestXsyr2 { auto status = clblasXsyr2(convertToCLBLAS(args.layout), convertToCLBLAS(args.triangle), args.n, args.alpha, - buffers.x_vec(), args.x_offset, args.x_inc, - buffers.y_vec(), args.y_offset, args.y_inc, - buffers.a_mat(), args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, + buffers.y_vec, args.y_offset, args.y_inc, + buffers.a_mat, args.a_offset, args.a_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xtbmv.h b/test/routines/level2/xtbmv.h index 1425b60b..9c4131ec 100644 --- a/test/routines/level2/xtbmv.h +++ b/test/routines/level2/xtbmv.h @@ -92,8 +92,8 @@ class TestXtbmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, args.kl, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xtpmv.h b/test/routines/level2/xtpmv.h index a834b437..58249227 100644 --- a/test/routines/level2/xtpmv.h +++ b/test/routines/level2/xtpmv.h @@ -92,8 +92,8 @@ class TestXtpmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.ap_mat(), args.ap_offset, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.ap_mat, args.ap_offset, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level2/xtrmv.h b/test/routines/level2/xtrmv.h index cd502d5d..635a1319 100644 --- a/test/routines/level2/xtrmv.h +++ b/test/routines/level2/xtrmv.h @@ -92,8 +92,8 @@ class TestXtrmv { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.n, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.x_vec(), args.x_offset, args.x_inc, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.x_vec, args.x_offset, args.x_inc, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xgemm.h b/test/routines/level3/xgemm.h index cd5c2acd..842dae93 100644 --- a/test/routines/level3/xgemm.h +++ b/test/routines/level3/xgemm.h @@ -105,9 +105,9 @@ class TestXgemm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.b_transpose), args.m, args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xhemm.h b/test/routines/level3/xhemm.h index edc71024..106b99ff 100644 --- a/test/routines/level3/xhemm.h +++ b/test/routines/level3/xhemm.h @@ -105,9 +105,9 @@ class TestXhemm { convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xher2k.h b/test/routines/level3/xher2k.h index a78e1293..e2f4448f 100644 --- a/test/routines/level3/xher2k.h +++ b/test/routines/level3/xher2k.h @@ -105,9 +105,9 @@ class TestXher2k { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, alpha2, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xherk.h b/test/routines/level3/xherk.h index 245293d6..43d7cfcd 100644 --- a/test/routines/level3/xherk.h +++ b/test/routines/level3/xherk.h @@ -95,8 +95,8 @@ class TestXherk { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xsymm.h b/test/routines/level3/xsymm.h index e638b735..c32b4cf7 100644 --- a/test/routines/level3/xsymm.h +++ b/test/routines/level3/xsymm.h @@ -105,9 +105,9 @@ class TestXsymm { convertToCLBLAS(args.side), convertToCLBLAS(args.triangle), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xsyr2k.h b/test/routines/level3/xsyr2k.h index abac20f4..57c3c203 100644 --- a/test/routines/level3/xsyr2k.h +++ b/test/routines/level3/xsyr2k.h @@ -103,9 +103,9 @@ class TestXsyr2k { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xsyrk.h b/test/routines/level3/xsyrk.h index 8a5fcb5f..6c3a3786 100644 --- a/test/routines/level3/xsyrk.h +++ b/test/routines/level3/xsyrk.h @@ -95,8 +95,8 @@ class TestXsyrk { convertToCLBLAS(args.triangle), convertToCLBLAS(args.a_transpose), args.n, args.k, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, args.beta, - buffers.c_mat(), args.c_offset, args.c_ld, + buffers.a_mat, args.a_offset, args.a_ld, args.beta, + buffers.c_mat, args.c_offset, args.c_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/routines/level3/xtrmm.h b/test/routines/level3/xtrmm.h index 7c9c21bc..3eb63030 100644 --- a/test/routines/level3/xtrmm.h +++ b/test/routines/level3/xtrmm.h @@ -97,8 +97,8 @@ class TestXtrmm { convertToCLBLAS(args.a_transpose), convertToCLBLAS(args.diagonal), args.m, args.n, args.alpha, - buffers.a_mat(), args.a_offset, args.a_ld, - buffers.b_mat(), args.b_offset, args.b_ld, + buffers.a_mat, args.a_offset, args.a_ld, + buffers.b_mat, args.b_offset, args.b_ld, 1, &queue_plain, 0, nullptr, &event); clWaitForEvents(1, &event); return static_cast(status); diff --git a/test/wrapper_cblas.h b/test/wrapper_cblas.h index 06ce6269..bf59aa94 100644 --- a/test/wrapper_cblas.h +++ b/test/wrapper_cblas.h @@ -31,16 +31,6 @@ CBLAS_UPLO convertToCBLAS(const Triangle v) { return (v == Triangle::kUpper) ? C CBLAS_DIAG convertToCBLAS(const Diagonal v) { return (v == Diagonal::kUnit) ? CblasUnit : CblasNonUnit; } CBLAS_SIDE convertToCBLAS(const Side v) { return (v == Side::kLeft) ? CblasLeft : CblasRight; } -// Conversions from and to half-precision -std::vector HalfToFloatBuffer(const std::vector& source) { - auto result = std::vector(source.size()); - for (auto i = size_t(0); i < source.size(); ++i) { result[i] = HalfToFloat(source[i]); } - return result; -} -void FloatToHalfBuffer(std::vector& result, const std::vector& source) { - for (auto i = size_t(0); i < source.size(); ++i) { result[i] = FloatToHalf(source[i]); } -} - // OpenBLAS is not fully Netlib CBLAS compatible #ifdef OPENBLAS_VERSION using return_pointer_float = openblas_complex_float*; diff --git a/test/wrapper_clblas.h b/test/wrapper_clblas.h index 6e44d780..5115b3d9 100644 --- a/test/wrapper_clblas.h +++ b/test/wrapper_clblas.h @@ -34,104 +34,104 @@ clblasSide convertToCLBLAS(const Side v) { return (v == Side::kLeft) ? clblasLef // Forwards the clBLAS calls for SROTG/DROTG template -clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, + Buffer& sb_buffer, const size_t sb_offset, + Buffer& sc_buffer, const size_t sc_offset, + Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> -clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, + Buffer& sb_buffer, const size_t sb_offset, + Buffer& sc_buffer, const size_t sc_offset, + Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSrotg(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, + return clblasSrotg(sa_buffer(), sa_offset, + sb_buffer(), sb_offset, + sc_buffer(), sc_offset, + ss_buffer(), ss_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> -clblasStatus clblasXrotg(cl_mem sa_buffer, const size_t sa_offset, - cl_mem sb_buffer, const size_t sb_offset, - cl_mem sc_buffer, const size_t sc_offset, - cl_mem ss_buffer, const size_t ss_offset, +clblasStatus clblasXrotg(Buffer& sa_buffer, const size_t sa_offset, + Buffer& sb_buffer, const size_t sb_offset, + Buffer& sc_buffer, const size_t sc_offset, + Buffer& ss_buffer, const size_t ss_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDrotg(sa_buffer, sa_offset, - sb_buffer, sb_offset, - sc_buffer, sc_offset, - ss_buffer, ss_offset, + return clblasDrotg(sa_buffer(), sa_offset, + sb_buffer(), sb_offset, + sc_buffer(), sc_offset, + ss_buffer(), ss_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROTMG/DROTMG template -clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, + Buffer& sd2_buffer, const size_t sd2_offset, + Buffer& sx1_buffer, const size_t sx1_offset, + const Buffer& sy1_buffer, const size_t sy1_offset, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> -clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, + Buffer& sd2_buffer, const size_t sd2_offset, + Buffer& sx1_buffer, const size_t sx1_offset, + const Buffer& sy1_buffer, const size_t sy1_offset, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasSrotmg(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, + return clblasSrotmg(sd1_buffer(), sd1_offset, + sd2_buffer(), sd2_offset, + sx1_buffer(), sx1_offset, + sy1_buffer(), sy1_offset, + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> -clblasStatus clblasXrotmg(cl_mem sd1_buffer, const size_t sd1_offset, - cl_mem sd2_buffer, const size_t sd2_offset, - cl_mem sx1_buffer, const size_t sx1_offset, - const cl_mem sy1_buffer, const size_t sy1_offset, - cl_mem sparam_buffer, const size_t sparam_offset, +clblasStatus clblasXrotmg(Buffer& sd1_buffer, const size_t sd1_offset, + Buffer& sd2_buffer, const size_t sd2_offset, + Buffer& sx1_buffer, const size_t sx1_offset, + const Buffer& sy1_buffer, const size_t sy1_offset, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasDrotmg(sd1_buffer, sd1_offset, - sd2_buffer, sd2_offset, - sx1_buffer, sx1_offset, - sy1_buffer, sy1_offset, - sparam_buffer, sparam_offset, + return clblasDrotmg(sd1_buffer(), sd1_offset, + sd2_buffer(), sd2_offset, + sx1_buffer(), sx1_offset, + sy1_buffer(), sy1_offset, + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SROT/DROT clblasStatus clblasXrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, const float cos, const float sin, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrot(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), cos, sin, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXrot(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, const double cos, const double sin, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrot(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), cos, sin, num_queues, queues, num_wait_events, wait_events, events); @@ -140,356 +140,394 @@ clblasStatus clblasXrot(const size_t n, // Forwards the clBLAS calls for SROTM/DROTM template clblasStatus clblasXrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSrotm(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - sparam_buffer, sparam_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXrotm(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem sparam_buffer, const size_t sparam_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& sparam_buffer, const size_t sparam_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDrotm(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - sparam_buffer, sparam_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + sparam_buffer(), sparam_offset, num_queues, queues, num_wait_events, wait_events, events); } // Forwards the clBLAS calls for SSWAP/DSWAP/CSWAP/ZSWAP template clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSswap(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDswap(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCswap(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZswap(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXswap(const size_t n, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXswap(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SSCAL/DSCAL/CSCAL/ZSCAL clblasStatus clblasXscal(const size_t n, const float alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSscal(n, alpha, - x_buffer, x_offset, static_cast(x_inc), + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const double alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDscal(n, alpha, - x_buffer, x_offset, static_cast(x_inc), + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const float2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCscal(n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const double2 alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZscal(n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXscal(const size_t n, const half alpha, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXscal(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SCOPY/DCOPY/CCOPY/ZCOPY template clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasScopy(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDcopy(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCcopy(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZcopy(n, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXcopy(const size_t n, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXcopy(n, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SAXPY/DAXPY/CAXPY/ZAXPY clblasStatus clblasXaxpy(const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSaxpy(n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDaxpy(n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCaxpy(n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZaxpy(n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXaxpy(const size_t n, const half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXaxpy(n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SDOT/DDOT template clblasStatus clblasXdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasSdot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDdot(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdot(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto dot_buffer_bis = HalfToFloatBuffer(dot_buffer, queues[0]); + auto status = clblasXdot(n, + dot_buffer_bis, dot_offset, + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(dot_buffer, dot_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for CDOTU/ZDOTU template clblasStatus clblasXdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasCdotu(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdotu(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasZdotu(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } @@ -497,42 +535,42 @@ clblasStatus clblasXdotu(const size_t n, // Forwards the clBLAS calls for CDOTC/ZDOTC template clblasStatus clblasXdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasCdotc(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXdotc(const size_t n, - cl_mem dot_buffer, const size_t dot_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& dot_buffer, const size_t dot_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasZdotc(n, - dot_buffer, dot_offset, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), + dot_buffer(), dot_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } @@ -540,229 +578,250 @@ clblasStatus clblasXdotc(const size_t n, // Forwards the clBLAS calls for SNRM2/DNRM2/ScNRM2/DzNRM2 template clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasSnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasDnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasScnrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasDznrm2(n, - nrm2_buffer, nrm2_offset, - x_buffer, x_offset, static_cast(x_inc), + nrm2_buffer(), nrm2_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXnrm2(const size_t n, - cl_mem nrm2_buffer, const size_t nrm2_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& nrm2_buffer, const size_t nrm2_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto nrm2_buffer_bis = HalfToFloatBuffer(nrm2_buffer, queues[0]); + auto status = clblasXnrm2(n, + nrm2_buffer_bis, nrm2_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(nrm2_buffer, nrm2_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SASUM/DASUM/ScASUM/DzASUM template clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasSasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasScasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, n); return clblasDzasum(n, - asum_buffer, asum_offset, - x_buffer, x_offset, static_cast(x_inc), + asum_buffer(), asum_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXasum(const size_t n, - cl_mem asum_buffer, const size_t asum_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& asum_buffer, const size_t asum_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto asum_buffer_bis = HalfToFloatBuffer(asum_buffer, queues[0]); + auto status = clblasXasum(n, + asum_buffer_bis, asum_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(asum_buffer, asum_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX template clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiSamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiDamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiCamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); auto context = queue.GetContext(); auto scratch_buffer = Buffer(context, 2*n); return clblasiZamax(n, - imax_buffer, imax_offset, - x_buffer, x_offset, static_cast(x_inc), + imax_buffer(), imax_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXamax(const size_t n, - cl_mem imax_buffer, const size_t imax_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& imax_buffer, const size_t imax_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto imax_buffer_bis = HalfToFloatBuffer(imax_buffer, queues[0]); + auto status = clblasXamax(n, + imax_buffer_bis, imax_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(imax_buffer, imax_buffer_bis, queues[0]); + return status; } // ================================================================================================= @@ -773,207 +832,231 @@ clblasStatus clblasXamax(const size_t n, clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgemv(layout, a_transpose, m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgemv(layout, a_transpose, m, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgemv(layout, a_transpose, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgemv(layout, a_transpose, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXgemv(layout, a_transpose, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SGBMV/DGBMV/CGBMV/ZGBMV clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgbmv(layout, a_transpose, m, n, kl, ku, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgbmv(layout, a_transpose, m, n, kl, ku, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgbmv(layout, a_transpose, m, n, kl, ku, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgbmv(layout, a_transpose, m, n, kl, ku, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgbmv(const clblasOrder layout, const clblasTranspose a_transpose, const size_t m, const size_t n, const size_t kl, const size_t ku, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXgbmv(layout, a_transpose, + m, n, kl, ku, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for CHEMV/ZHEMV clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChemv(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhemv(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -981,37 +1064,37 @@ clblasStatus clblasXhemv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChbmv(layout, triangle, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhbmv(layout, triangle, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1019,37 +1102,37 @@ clblasStatus clblasXhbmv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpmv(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), cl_float2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double2 beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpmv(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), cl_double2{{beta.real(), beta.imag()}}, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1057,162 +1140,198 @@ clblasStatus clblasXhpmv(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsymv(layout, triangle, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsymv(layout, triangle, n, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXsymv(layout, triangle, + n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SSBMV/DSBMV clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsbmv(layout, triangle, n, k, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsbmv(layout, triangle, n, k, alpha, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsbmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const size_t k, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXsbmv(layout, triangle, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SSPMV/DSPMV clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const float beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspmv(layout, triangle, n, alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const double beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspmv(layout, triangle, n, alpha, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), beta, - y_buffer, y_offset, static_cast(y_inc), + y_buffer(), y_offset, static_cast(y_inc), num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspmv(const clblasOrder layout, const clblasUplo triangle, const size_t n, const half alpha, - const cl_mem ap_buffer, const size_t ap_offset, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, const half beta, - cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto status = clblasXspmv(layout, triangle, + n, + HalfToFloat(alpha), + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc, + HalfToFloat(beta), + y_buffer_bis, y_offset, y_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(y_buffer, y_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for STRMV/DTRMV/CTRMV/ZTRMV template clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1220,16 +1339,16 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer(context, n); return clblasStrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1237,16 +1356,16 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasDtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1254,16 +1373,16 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasCtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1271,34 +1390,42 @@ clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer(context, n); return clblasZtrmv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXtrmv(layout, triangle, a_transpose, diagonal, + n, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for STBMV/DTBMV/CTBMV/ZTBMV template clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1306,16 +1433,16 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer(context, n); return clblasStbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1323,16 +1450,16 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasDtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1340,16 +1467,16 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasCtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1357,34 +1484,42 @@ clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer(context, n); return clblasZtbmv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXtbmv(layout, triangle, a_transpose, diagonal, + n, k, + a_buffer_bis, a_offset, a_ld, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for STPMV/DTPMV/CTPMV/ZTPMV template clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1392,16 +1527,16 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo trian auto scratch_buffer = Buffer(context, n); return clblasStpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1409,16 +1544,16 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasDtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1426,16 +1561,16 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo tria auto scratch_buffer = Buffer(context, n); return clblasCtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { auto queue = Queue(queues[0]); @@ -1443,79 +1578,87 @@ clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo tri auto scratch_buffer = Buffer(context, n); return clblasZtpmv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), scratch_buffer(), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpmv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto status = clblasXtpmv(layout, triangle, a_transpose, diagonal, + n, + ap_buffer_bis, ap_offset, + x_buffer_bis, x_offset, x_inc, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(x_buffer, x_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for STRSV/DTRSV/CTRSV/ZTRSV template clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrsv(layout, triangle, a_transpose, diagonal, n, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1523,60 +1666,60 @@ clblasStatus clblasXtrsv(const clblasOrder layout, const clblasUplo tri template clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, const size_t k, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtbsv(layout, triangle, a_transpose, diagonal, n, k, - a_buffer, a_offset, a_ld, - x_buffer, x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1584,60 +1727,60 @@ clblasStatus clblasXtbsv(const clblasOrder layout, const clblasUplo tri template clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events); template <> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } template <> clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t n, - const cl_mem ap_buffer, const size_t ap_offset, - cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& ap_buffer, const size_t ap_offset, + Buffer& x_buffer, const size_t x_offset, const size_t x_inc, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtpsv(layout, triangle, a_transpose, diagonal, n, - ap_buffer, ap_offset, - x_buffer, x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, + x_buffer(), x_offset, static_cast(x_inc), num_queues, queues, num_wait_events, wait_events, events); } @@ -1645,77 +1788,88 @@ clblasStatus clblasXtpsv(const clblasOrder layout, const clblasUplo tri clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSger(layout, m, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDger(layout, m, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXger(const clblasOrder layout, const size_t m, const size_t n, const half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXger(layout, + m, n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for CGERU/ZGERU clblasStatus clblasXgeru(const clblasOrder layout, const size_t m, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgeru(layout, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgeru(const clblasOrder layout, const size_t m, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgeru(layout, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1723,33 +1877,33 @@ clblasStatus clblasXgeru(const clblasOrder layout, clblasStatus clblasXgerc(const clblasOrder layout, const size_t m, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgerc(layout, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgerc(const clblasOrder layout, const size_t m, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgerc(layout, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1757,29 +1911,29 @@ clblasStatus clblasXgerc(const clblasOrder layout, clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1787,29 +1941,29 @@ clblasStatus clblasXher(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } @@ -1817,33 +1971,33 @@ clblasStatus clblasXhpr(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher2(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher2(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -1851,33 +2005,33 @@ clblasStatus clblasXher2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChpr2(layout, triangle, n, cl_float2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double2 alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhpr2(layout, triangle, n, cl_double2{{alpha.real(), alpha.imag()}}, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } @@ -1885,166 +2039,206 @@ clblasStatus clblasXhpr2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXsyr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SSPR/DSPR clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspr(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspr(const clblasOrder layout, const clblasUplo triangle, const size_t n, const half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto status = clblasXspr(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + ap_buffer_bis, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SSYR2/DSYR2 clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - a_buffer, a_offset, a_ld, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + a_buffer(), a_offset, a_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto status = clblasXsyr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + a_buffer_bis, a_offset, a_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(a_buffer, a_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SSPR2/DSPR2 clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const float alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSspr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const double alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDspr2(layout, triangle, n, alpha, - x_buffer, x_offset, static_cast(x_inc), - y_buffer, y_offset, static_cast(y_inc), - ap_buffer, ap_offset, + x_buffer(), x_offset, static_cast(x_inc), + y_buffer(), y_offset, static_cast(y_inc), + ap_buffer(), ap_offset, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, const size_t n, const half alpha, - const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, - const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, - cl_mem ap_buffer, const size_t ap_offset, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + const Buffer& x_buffer, const size_t x_offset, const size_t x_inc, + const Buffer& y_buffer, const size_t y_offset, const size_t y_inc, + Buffer& ap_buffer, const size_t ap_offset, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto x_buffer_bis = HalfToFloatBuffer(x_buffer, queues[0]); + auto y_buffer_bis = HalfToFloatBuffer(y_buffer, queues[0]); + auto ap_buffer_bis = HalfToFloatBuffer(ap_buffer, queues[0]); + auto status = clblasXspr2(layout, triangle, + n, + HalfToFloat(alpha), + x_buffer_bis, x_offset, x_inc, + y_buffer_bis, y_offset, y_inc, + ap_buffer_bis, ap_offset, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(ap_buffer, ap_buffer_bis, queues[0]); + return status; } // ================================================================================================= @@ -2055,207 +2249,231 @@ clblasStatus clblasXspr2(const clblasOrder layout, const clblasUplo triangle, clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSgemm(layout, a_transpose, b_transpose, m, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDgemm(layout, a_transpose, b_transpose, m, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCgemm(layout, a_transpose, b_transpose, m, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZgemm(layout, a_transpose, b_transpose, m, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXgemm(const clblasOrder layout, const clblasTranspose a_transpose, const clblasTranspose b_transpose, const size_t m, const size_t n, const size_t k, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXgemm(layout, a_transpose, b_transpose, + m, n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for SSYMM/DSYMM/CSYMM/ZSYMM clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsymm(layout, side, triangle, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsymm(layout, side, triangle, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsymm(layout, side, triangle, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZsymm(layout, side, triangle, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsymm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsymm(layout, side, triangle, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for CHEMM/ZHEMM clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasChemm(layout, side, triangle, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZhemm(layout, side, triangle, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -2263,109 +2481,119 @@ clblasStatus clblasXhemm(const clblasOrder layout, const clblasSide side, const clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyrk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyrk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsyrk(layout, triangle, a_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZsyrk(layout, triangle, a_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyrk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsyrk(layout, triangle, a_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for CHERK/ZHERK clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCherk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose a_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZherk(layout, triangle, a_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, + a_buffer(), a_offset, a_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -2373,122 +2601,134 @@ clblasStatus clblasXherk(const clblasOrder layout, const clblasUplo triangle, co clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasSsyr2k(layout, triangle, ab_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDsyr2k(layout, triangle, ab_transpose, n, k, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCsyr2k(layout, triangle, ab_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_float2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double2 beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZsyr2k(layout, triangle, ab_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, cl_double2{{beta.real(), beta.imag()}}, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXsyr2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const half beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto c_buffer_bis = HalfToFloatBuffer(c_buffer, queues[0]); + auto status = clblasXsyr2k(layout, triangle, ab_transpose, + n, k, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + HalfToFloat(beta), + c_buffer_bis, c_offset, c_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(c_buffer, c_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for CHER2K/ZHER2K clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const float beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCher2k(layout, triangle, ab_transpose, n, k, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, const clblasTranspose ab_transpose, const size_t n, const size_t k, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + const Buffer& b_buffer, const size_t b_offset, const size_t b_ld, const double beta, - cl_mem c_buffer, const size_t c_offset, const size_t c_ld, + Buffer& c_buffer, const size_t c_offset, const size_t c_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZher2k(layout, triangle, ab_transpose, n, k, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, beta, - c_buffer, c_offset, c_ld, + c_buffer(), c_offset, c_ld, num_queues, queues, num_wait_events, wait_events, events); } @@ -2496,134 +2736,152 @@ clblasStatus clblasXher2k(const clblasOrder layout, const clblasUplo triangle, c clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrmm(layout, side, triangle, a_transpose, diagonal, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrmm(layout, side, triangle, a_transpose, diagonal, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrmm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto status = clblasXtrmm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]); + return status; } // Forwards the clBLAS calls for STRSM/DTRSM/CTRSM/ZTRSM clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasStrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasDtrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const float2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasCtrsm(layout, side, triangle, a_transpose, diagonal, m, n, cl_float2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const double2 alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, cl_uint num_queues, cl_command_queue *queues, cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { return clblasZtrsm(layout, side, triangle, a_transpose, diagonal, m, n, cl_double2{{alpha.real(), alpha.imag()}}, - a_buffer, a_offset, a_ld, - b_buffer, b_offset, b_ld, + a_buffer(), a_offset, a_ld, + b_buffer(), b_offset, b_ld, num_queues, queues, num_wait_events, wait_events, events); } clblasStatus clblasXtrsm(const clblasOrder layout, const clblasSide side, const clblasUplo triangle, const clblasTranspose a_transpose, const clblasDiag diagonal, const size_t m, const size_t n, const half alpha, - const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, - cl_mem b_buffer, const size_t b_offset, const size_t b_ld, - cl_uint num_queues, cl_command_queue *queues, - cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { - return clblasNotImplemented; + const Buffer& a_buffer, const size_t a_offset, const size_t a_ld, + Buffer& b_buffer, const size_t b_offset, const size_t b_ld, + cl_uint num_queues, cl_command_queue *queues, + cl_uint num_wait_events, const cl_event *wait_events, cl_event *events) { + auto a_buffer_bis = HalfToFloatBuffer(a_buffer, queues[0]); + auto b_buffer_bis = HalfToFloatBuffer(b_buffer, queues[0]); + auto status = clblasXtrsm(layout, side, triangle, a_transpose, diagonal, + m, n, + HalfToFloat(alpha), + a_buffer_bis, a_offset, a_ld, + b_buffer_bis, b_offset, b_ld, + num_queues, queues, num_wait_events, wait_events, events); + FloatToHalfBuffer(b_buffer, b_buffer_bis, queues[0]); + return status; } // ================================================================================================= -- cgit v1.2.3