// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren
//
// This file implements all the BLAS API calls. In all cases, it does not much more than creating
// a new object of the appropriate type, and calling the main routine on that object. It forwards
// all status codes to the caller.
//
// =================================================================================================

// NOTE(review): the first #include below carries no header name in this copy -- confirm which
// header was intended.
#include
#include "cache.hpp"
#include "clblast.h"

// BLAS level-1 includes
#include "routines/level1/xswap.hpp"
#include "routines/level1/xscal.hpp"
#include "routines/level1/xcopy.hpp"
#include "routines/level1/xaxpy.hpp"
#include "routines/level1/xdot.hpp"
#include "routines/level1/xdotu.hpp"
#include "routines/level1/xdotc.hpp"
#include "routines/level1/xnrm2.hpp"
#include "routines/level1/xasum.hpp"
#include "routines/level1/xsum.hpp" // non-BLAS routine
#include "routines/level1/xamax.hpp"
#include "routines/level1/xmax.hpp" // non-BLAS routine
#include "routines/level1/xmin.hpp" // non-BLAS routine

// BLAS level-2 includes
#include "routines/level2/xgemv.hpp"
#include "routines/level2/xgbmv.hpp"
#include "routines/level2/xhemv.hpp"
#include "routines/level2/xhbmv.hpp"
#include "routines/level2/xhpmv.hpp"
#include "routines/level2/xsymv.hpp"
#include "routines/level2/xsbmv.hpp"
#include "routines/level2/xspmv.hpp"
#include "routines/level2/xtrmv.hpp"
#include "routines/level2/xtbmv.hpp"
#include "routines/level2/xtpmv.hpp"
#include "routines/level2/xtrsv.hpp"
#include "routines/level2/xger.hpp"
#include "routines/level2/xgeru.hpp"
#include "routines/level2/xgerc.hpp"
#include "routines/level2/xher.hpp"
#include "routines/level2/xhpr.hpp"
#include "routines/level2/xher2.hpp"
#include "routines/level2/xhpr2.hpp"
#include "routines/level2/xsyr.hpp"
#include "routines/level2/xspr.hpp"
#include "routines/level2/xsyr2.hpp"
#include "routines/level2/xspr2.hpp"

// BLAS level-3 includes
#include "routines/level3/xgemm.hpp"
#include "routines/level3/xsymm.hpp"
#include "routines/level3/xhemm.hpp"
#include "routines/level3/xsyrk.hpp"
#include "routines/level3/xherk.hpp"
#include "routines/level3/xsyr2k.hpp"
#include "routines/level3/xher2k.hpp"
#include "routines/level3/xtrmm.hpp"
#include "routines/level3/xtrsm.hpp"

// Level-x includes (non-BLAS)
#include "routines/levelx/xomatcopy.hpp"
#include "routines/levelx/xaxpybatched.hpp"

namespace clblast {
// =================================================================================================
// BLAS level-1 (vector-vector) routines
// =================================================================================================

// NOTE(review): in this copy `template` is followed directly by the return type (no `<...>`
// parameter or argument list), and the explicit instantiations of one routine are textually
// identical. The angle-bracket lists look stripped -- confirm against the upstream file.

// Generate givens plane rotation: SROTG/DROTG
// Placeholder: all parameters are unnamed and unused; always returns StatusCode::kNotImplemented.
template
StatusCode Rotg(cl_mem, const size_t,
                cl_mem, const size_t,
                cl_mem, const size_t,
                cl_mem, const size_t,
                cl_command_queue*, cl_event*) {
  return StatusCode::kNotImplemented;
}
// Explicit instantiations (marked PUBLIC_API).
template StatusCode PUBLIC_API Rotg(cl_mem, const size_t,
    cl_mem, const size_t,
    cl_mem, const size_t,
    cl_mem, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rotg(cl_mem, const size_t,
    cl_mem, const size_t,
    cl_mem, const size_t,
    cl_mem, const size_t,
    cl_command_queue*, cl_event*);

// Generate modified givens plane rotation: SROTMG/DROTMG
// Placeholder: all parameters are unnamed and unused; always returns StatusCode::kNotImplemented.
template
StatusCode Rotmg(cl_mem, const size_t,
                 cl_mem, const size_t,
                 cl_mem, const size_t,
                 const cl_mem, const size_t,
                 cl_mem, const size_t,
                 cl_command_queue*, cl_event*) {
  return StatusCode::kNotImplemented;
}
// Explicit instantiations (marked PUBLIC_API).
template StatusCode PUBLIC_API Rotmg(cl_mem, const size_t,
    cl_mem, const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t,
    cl_mem, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rotmg(cl_mem, const
size_t,
    cl_mem, const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t,
    cl_mem, const size_t,
    cl_command_queue*, cl_event*);

// Apply givens plane rotation: SROT/DROT
// Placeholder: all parameters are unnamed and unused; always returns StatusCode::kNotImplemented.
template
StatusCode Rot(const size_t,
               cl_mem, const size_t, const size_t,
               cl_mem, const size_t, const size_t,
               const T,
               const T,
               cl_command_queue*, cl_event*) {
  return StatusCode::kNotImplemented;
}
// Explicit instantiations (marked PUBLIC_API): float and double scalar arguments.
template StatusCode PUBLIC_API Rot(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    const float,
    const float,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rot(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    const double,
    const double,
    cl_command_queue*, cl_event*);

// Apply modified givens plane rotation: SROTM/DROTM
// Placeholder: all parameters are unnamed and unused; always returns StatusCode::kNotImplemented.
template
StatusCode Rotm(const size_t,
                cl_mem, const size_t, const size_t,
                cl_mem, const size_t, const size_t,
                cl_mem, const size_t,
                cl_command_queue*, cl_event*) {
  return StatusCode::kNotImplemented;
}
// Explicit instantiations (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Rotm(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Rotm(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t,
    cl_command_queue*, cl_event*);

// Swap two vectors: SSWAP/DSWAP/CSWAP/ZSWAP/HSWAP
// Creates an Xswap routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoSwap with the cl_mem handles wrapped in Buffer. Returns kSuccess; if
// anything throws, the result of DispatchException() is returned instead.
template
StatusCode Swap(const size_t n,
                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xswap(queue_cpp, event);
    routine.DoSwap(n,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Swap (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Swap(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Swap(const size_t,
    cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Vector scaling: SSCAL/DSCAL/CSCAL/ZSCAL/HSCAL
// Creates an Xscal routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoScal with `alpha` and the Buffer-wrapped `x_buffer`. Returns kSuccess; if
// anything throws, the result of DispatchException() is returned instead.
template
StatusCode Scal(const size_t n,
                const T alpha,
                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xscal(queue_cpp, event);
    routine.DoScal(n,
                   alpha,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Scal (marked PUBLIC_API), one per scalar type of `alpha`.
template StatusCode PUBLIC_API Scal(const size_t,
    const float,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal(const size_t,
    const double,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal(const size_t,
    const float2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal(const size_t,
    const double2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Scal(const size_t,
    const half,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Vector copy: SCOPY/DCOPY/CCOPY/ZCOPY/HCOPY
// Creates an Xcopy routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoCopy with the cl_mem handles wrapped in Buffer. Returns kSuccess; if
// anything throws, the result of DispatchException() is returned instead.
template
StatusCode Copy(const size_t n,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xcopy(queue_cpp, event);
    routine.DoCopy(n,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Copy (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Copy(const size_t,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy(const size_t,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy(const size_t,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy(const size_t,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Copy(const size_t,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Vector-times-constant plus vector: SAXPY/DAXPY/CAXPY/ZAXPY/HAXPY
// Creates an Xaxpy routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoAxpy with `alpha` and the Buffer-wrapped cl_mem handles. Returns kSuccess;
// if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Axpy(const size_t n,
                const T alpha,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xaxpy(queue_cpp, event);
    routine.DoAxpy(n,
                   alpha,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Axpy (marked PUBLIC_API), one per scalar type of `alpha`.
template StatusCode PUBLIC_API Axpy(const size_t,
    const float,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy(const size_t,
    const double,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy(const size_t,
    const float2,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy(const size_t,
    const double2,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Axpy(const size_t,
    const half,
    const cl_mem, const size_t, const size_t,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Dot product of two vectors: SDOT/DDOT/HDOT
// Creates an Xdot routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoDot with the Buffer-wrapped result (`dot_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Dot(const size_t n,
               cl_mem dot_buffer, const size_t dot_offset,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xdot(queue_cpp, event);
    routine.DoDot(n,
                  Buffer(dot_buffer), dot_offset,
                  Buffer(x_buffer), x_offset, x_inc,
                  Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Dot (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Dot(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dot(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dot(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Dot product of two complex vectors: CDOTU/ZDOTU
// Creates an Xdotu routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoDotu with the Buffer-wrapped result (`dot_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Dotu(const size_t n,
                cl_mem dot_buffer, const size_t dot_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xdotu(queue_cpp, event);
    routine.DoDotu(n,
                   Buffer(dot_buffer), dot_offset,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Dotu (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Dotu(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dotu(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Dot product of two complex vectors, one conjugated: CDOTC/ZDOTC
// Creates an Xdotc routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoDotc with the Buffer-wrapped result (`dot_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Dotc(const size_t n,
                cl_mem dot_buffer, const size_t dot_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xdotc(queue_cpp, event);
    routine.DoDotc(n,
                   Buffer(dot_buffer), dot_offset,
                   Buffer(x_buffer), x_offset, x_inc,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }
}
// Explicit instantiations of Dotc (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Dotc(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Dotc(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Euclidean norm of a vector: SNRM2/DNRM2/ScNRM2/DzNRM2/HNRM2
// Creates an Xnrm2 routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoNrm2 with the Buffer-wrapped result (`nrm2_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Nrm2(const size_t n,
                cl_mem nrm2_buffer, const size_t nrm2_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xnrm2(queue_cpp, event);
    routine.DoNrm2(n,
                   Buffer(nrm2_buffer), nrm2_offset,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Nrm2 (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Nrm2(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Nrm2(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Absolute sum of values in a vector: SASUM/DASUM/ScASUM/DzASUM/HASUM
// Creates an Xasum routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoAsum with the Buffer-wrapped result (`asum_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Asum(const size_t n,
                cl_mem asum_buffer, const size_t asum_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xasum(queue_cpp, event);
    routine.DoAsum(n,
                   Buffer(asum_buffer), asum_offset,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Asum (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Asum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Asum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Sum of values in a vector (non-BLAS function): SSUM/DSUM/ScSUM/DzSUM/HSUM
// Creates an Xsum routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoSum with the Buffer-wrapped result (`sum_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Sum(const size_t n,
               cl_mem sum_buffer, const size_t sum_offset,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsum(queue_cpp, event);
    routine.DoSum(n,
                  Buffer(sum_buffer), sum_offset,
                  Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Sum (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Sum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sum(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Index of absolute maximum value in a vector: iSAMAX/iDAMAX/iCAMAX/iZAMAX/iHAMAX
// Creates an Xamax routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoAmax with the Buffer-wrapped result (`imax_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Amax(const size_t n,
                cl_mem imax_buffer, const size_t imax_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xamax(queue_cpp, event);
    routine.DoAmax(n,
                   Buffer(imax_buffer), imax_offset,
                   Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Amax (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Amax(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Amax(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Index of maximum value in a vector (non-BLAS function): iSMAX/iDMAX/iCMAX/iZMAX/iHMAX
// Creates an Xmax routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoMax with the Buffer-wrapped result (`imax_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Max(const size_t n,
               cl_mem imax_buffer, const size_t imax_offset,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xmax(queue_cpp, event);
    routine.DoMax(n,
                  Buffer(imax_buffer), imax_offset,
                  Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Max (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Max(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Max(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Index of minimum value in a vector (non-BLAS function): iSMIN/iDMIN/iCMIN/iZMIN/iHMIN
// Creates an Xmin routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoMin with the Buffer-wrapped result (`imin_buffer`) and input handles.
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Min(const size_t n,
               cl_mem imin_buffer, const size_t imin_offset,
               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
               cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xmin(queue_cpp, event);
    routine.DoMin(n,
                  Buffer(imin_buffer), imin_offset,
                  Buffer(x_buffer), x_offset, x_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Min (marked PUBLIC_API).
// NOTE(review): textually identical as written -- confirm the per-type template arguments.
template StatusCode PUBLIC_API Min(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Min(const size_t,
    cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// =================================================================================================
// BLAS level-2 (matrix-vector) routines
// =================================================================================================

// General matrix-vector multiplication: SGEMV/DGEMV/CGEMV/ZGEMV/HGEMV
// Creates an Xgemv routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoGemv, forwarding layout/transpose/sizes/scalars unchanged and the cl_mem
// handles wrapped in Buffer. Returns kSuccess; if anything throws, the result of
// DispatchException() is returned instead.
template
StatusCode Gemv(const Layout layout, const Transpose a_transpose,
                const size_t m, const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const T beta,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xgemv(queue_cpp, event);
    routine.DoGemv(layout, a_transpose,
                   m, n,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Gemv (marked PUBLIC_API), one per scalar type of `alpha`/`beta`.
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
    const size_t, const size_t,
    const float,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const float,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
    const size_t, const size_t,
    const double,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const double,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
    const size_t, const size_t,
    const float2,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const float2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
    const size_t, const size_t,
    const double2,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const double2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gemv(const Layout, const Transpose,
    const size_t, const size_t,
    const half,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const half,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// General banded matrix-vector multiplication: SGBMV/DGBMV/CGBMV/ZGBMV/HGBMV
// Creates an Xgbmv routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoGbmv, forwarding layout/transpose/sizes (including `kl`, `ku`)/scalars
// unchanged and the cl_mem handles wrapped in Buffer.
template
StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
                const size_t m, const size_t n, const size_t kl, const size_t ku,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const T beta,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xgbmv(queue_cpp, event);
    routine.DoGbmv(layout, a_transpose,
                   m,
n, kl, ku,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }  // failure is reported via the return value
}
// Explicit instantiations of Gbmv (marked PUBLIC_API), one per scalar type of `alpha`/`beta`.
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
    const size_t, const size_t, const size_t, const size_t,
    const float,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const float,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
    const size_t, const size_t, const size_t, const size_t,
    const double,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const double,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
    const size_t, const size_t, const size_t, const size_t,
    const float2,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const float2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
    const size_t, const size_t, const size_t, const size_t,
    const double2,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const double2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Gbmv(const Layout, const Transpose,
    const size_t, const size_t, const size_t, const size_t,
    const half,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const half,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Hermitian matrix-vector multiplication: CHEMV/ZHEMV
// Creates an Xhemv routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoHemv, forwarding layout/triangle/size/scalars unchanged and the cl_mem
// handles wrapped in Buffer. Returns kSuccess; if anything throws, the result of
// DispatchException() is returned instead.
template
StatusCode Hemv(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
const T beta,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhemv(queue_cpp, event);
    routine.DoHemv(layout, triangle,
                   n,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...) { return DispatchException(); }  // failure is reported via the return value
}
// Explicit instantiations of Hemv (marked PUBLIC_API): float2 and double2 scalars.
template StatusCode PUBLIC_API Hemv(const Layout, const Triangle,
    const size_t,
    const float2,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const float2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hemv(const Layout, const Triangle,
    const size_t,
    const double2,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const double2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Hermitian banded matrix-vector multiplication: CHBMV/ZHBMV
// Creates an Xhbmv routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoHbmv, forwarding layout/triangle/sizes (`n`, `k`)/scalars unchanged and
// the cl_mem handles wrapped in Buffer. Returns kSuccess; if anything throws, the result of
// DispatchException() is returned instead.
template
StatusCode Hbmv(const Layout layout, const Triangle triangle,
                const size_t n, const size_t k,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const T beta,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhbmv(queue_cpp, event);
    routine.DoHbmv(layout, triangle,
                   n, k,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Hbmv (marked PUBLIC_API): float2 and double2 scalars.
template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle,
    const size_t, const size_t,
    const float2,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const float2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hbmv(const Layout, const Triangle,
    const size_t, const size_t,
    const double2,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const double2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Hermitian packed matrix-vector multiplication: CHPMV/ZHPMV
// Creates an Xhpmv routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoHpmv, forwarding layout/triangle/size/scalars unchanged and the cl_mem
// handles wrapped in Buffer (`ap_buffer` is passed with an offset only, no leading dimension).
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Hpmv(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem ap_buffer, const size_t ap_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const T beta,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xhpmv(queue_cpp, event);
    routine.DoHpmv(layout, triangle,
                   n,
                   alpha,
                   Buffer(ap_buffer), ap_offset,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Hpmv (marked PUBLIC_API): float2 and double2 scalars.
template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle,
    const size_t,
    const float2,
    const cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const float2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Hpmv(const Layout, const Triangle,
    const size_t,
    const double2,
    const cl_mem, const size_t,
    const cl_mem, const size_t, const size_t,
    const double2,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Symmetric matrix-vector multiplication: SSYMV/DSYMV/HSYMV
// Creates an Xsymv routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoSymv, forwarding layout/triangle/size/scalars unchanged and the cl_mem
// handles wrapped in Buffer. Returns kSuccess; if anything throws, the result of
// DispatchException() is returned instead.
template
StatusCode Symv(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const T beta,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsymv(queue_cpp, event);
    routine.DoSymv(layout, triangle,
                   n,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Symv (marked PUBLIC_API): float, double and half scalars.
template StatusCode PUBLIC_API Symv(const Layout, const Triangle,
    const size_t,
    const float,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const float,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symv(const Layout, const Triangle,
    const size_t,
    const double,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const double,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Symv(const Layout, const Triangle,
    const size_t,
    const half,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const half,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Symmetric banded matrix-vector multiplication: SSBMV/DSBMV/HSBMV
// Creates an Xsbmv routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoSbmv, forwarding layout/triangle/sizes (`n`, `k`)/scalars unchanged and
// the cl_mem handles wrapped in Buffer. Returns kSuccess; if anything throws, the result of
// DispatchException() is returned instead.
template
StatusCode Sbmv(const Layout layout, const Triangle triangle,
                const size_t n, const size_t k,
                const T alpha,
                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const T beta,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xsbmv(queue_cpp, event);
    routine.DoSbmv(layout, triangle,
                   n, k,
                   alpha,
                   Buffer(a_buffer), a_offset, a_ld,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); }  // catch-all handler: the failure is reported via the return value
}
// Explicit instantiations of Sbmv (marked PUBLIC_API): float, double and half scalars.
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle,
    const size_t, const size_t,
    const float,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const float,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle,
    const size_t, const size_t,
    const double,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const double,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);
template StatusCode PUBLIC_API Sbmv(const Layout, const Triangle,
    const size_t, const size_t,
    const half,
    const cl_mem, const size_t, const size_t,
    const cl_mem, const size_t, const size_t,
    const half,
    cl_mem, const size_t, const size_t,
    cl_command_queue*, cl_event*);

// Symmetric packed matrix-vector multiplication: SSPMV/DSPMV/HSPMV
// Creates an Xspmv routine for a Queue wrapping `*queue` (dereferenced unconditionally) and
// `event`, then calls DoSpmv, forwarding layout/triangle/size/scalars unchanged and the cl_mem
// handles wrapped in Buffer (`ap_buffer` is passed with an offset only, no leading dimension).
// Returns kSuccess; if anything throws, the result of DispatchException() is returned instead.
template
StatusCode Spmv(const Layout layout, const Triangle triangle,
                const size_t n,
                const T alpha,
                const cl_mem ap_buffer, const size_t ap_offset,
                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
                const T beta,
                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
                cl_command_queue* queue, cl_event* event) {
  try {
    auto queue_cpp = Queue(*queue);
    auto routine = Xspmv(queue_cpp, event);
    routine.DoSpmv(layout, triangle,
                   n,
                   alpha,
                   Buffer(ap_buffer), ap_offset,
                   Buffer(x_buffer), x_offset, x_inc,
                   beta,
                   Buffer(y_buffer), y_offset, y_inc);
    return StatusCode::kSuccess;
  } catch (...)
{ return DispatchException(); } } template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spmv(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Triangular matrix-vector multiplication: STRMV/DTRMV/CTRMV/ZTRMV/HTRMV template StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtrmv(queue_cpp, event); routine.DoTrmv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Triangular banded matrix-vector multiplication: STBMV/DTBMV/CTBMV/ZTBMV/HTBMV template StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const size_t k, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtbmv(queue_cpp, event); routine.DoTbmv(layout, triangle, a_transpose, diagonal, n, k, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Triangular packed matrix-vector multiplication: STPMV/DTPMV/CTPMV/ZTPMV/HTPMV template StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem ap_buffer, const size_t ap_offset, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtpmv(queue_cpp, event); routine.DoTpmv(layout, triangle, a_transpose, diagonal, n, Buffer(ap_buffer), ap_offset, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpmv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a triangular system of equations: STRSV/DTRSV/CTRSV/ZTRSV template StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t n, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtrsv(queue_cpp, event); routine.DoTrsv(layout, triangle, a_transpose, diagonal, n, Buffer(a_buffer), a_offset, a_ld, Buffer(x_buffer), x_offset, x_inc); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a banded triangular system of equations: STBSV/DTBSV/CTBSV/ZTBSV template StatusCode Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API 
Tbsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a packed triangular system of equations: STPSV/DTPSV/CTPSV/ZTPSV template StatusCode Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*) { return StatusCode::kNotImplemented; } template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Tpsv(const Layout, const Triangle, const Transpose, const Diagonal, const size_t, const cl_mem, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 matrix update: SGER/DGER/HGER template StatusCode Ger(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xger(queue_cpp, event); routine.DoGer(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Ger(const Layout, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 complex matrix update: CGERU/ZGERU template StatusCode Geru(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgeru(queue_cpp, event); routine.DoGeru(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Geru(const Layout, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Geru(const Layout, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // General rank-1 complex conjugated matrix update: CGERC/ZGERC template StatusCode Gerc(const Layout layout, const size_t m, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgerc(queue_cpp, event); routine.DoGerc(layout, m, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Gerc(const Layout, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gerc(const Layout, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian rank-1 matrix update: CHER/ZHER template StatusCode Her(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xher,T>(queue_cpp, event); routine.DoHer(layout, triangle, n, alpha, Buffer>(x_buffer), x_offset, x_inc, Buffer>(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Her(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Her(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian packed rank-1 matrix update: CHPR/ZHPR template StatusCode Hpr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xhpr,T>(queue_cpp, event); routine.DoHpr(layout, triangle, n, alpha, Buffer>(x_buffer), x_offset, x_inc, Buffer>(ap_buffer), ap_offset); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Hpr(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Hermitian rank-2 matrix update: CHER2/ZHER2 template StatusCode Her2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xher2(queue_cpp, event); routine.DoHer2(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); 
return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Her2(const Layout, const Triangle, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Her2(const Layout, const Triangle, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian packed rank-2 matrix update: CHPR2/ZHPR2 template StatusCode Hpr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xhpr2(queue_cpp, event); routine.DoHpr2(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(ap_buffer), ap_offset); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Hpr2(const Layout, const Triangle, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsyr(queue_cpp, event); routine.DoSyr(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xspr(queue_cpp, event); routine.DoSpr(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(ap_buffer), ap_offset); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spr(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsyr2(queue_cpp, event); routine.DoSyr2(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(a_buffer), a_offset, a_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr2(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, const T alpha, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xspr2(queue_cpp, event); routine.DoSpr2(layout, triangle, n, alpha, Buffer(x_buffer), x_offset, x_inc, Buffer(y_buffer), y_offset, y_inc, Buffer(ap_buffer), ap_offset); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Spr2(const Layout, const Triangle, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines // ================================================================================================= // General matrix-matrix multiplication: SGEMM/DGEMM/CGEMM/ZGEMM/HGEMM template StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose, const size_t m, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xgemm(queue_cpp, event); routine.DoGemm(layout, a_transpose, b_transpose, m, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld, beta, Buffer(c_buffer), c_offset, c_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Gemm(const Layout, const Transpose, const Transpose, const size_t, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Symmetric matrix-matrix multiplication: SSYMM/DSYMM/CSYMM/ZSYMM/HSYMM template StatusCode Symm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, 
cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsymm(queue_cpp, event); routine.DoSymm(layout, side, triangle, m, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld, beta, Buffer(c_buffer), c_offset, c_ld); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Symm(const Layout, const Side, const Triangle, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Hermitian matrix-matrix multiplication: CHEMM/ZHEMM template StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const 
size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xhemm(queue_cpp, event); routine.DoHemm(layout, side, triangle, m, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld, beta, Buffer(c_buffer), c_offset, c_ld); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Hemm(const Layout, const Side, const Triangle, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Rank-K update of a symmetric matrix: SSYRK/DSYRK/CSYRK/ZSYRK/HSYRK template StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsyrk(queue_cpp, event); routine.DoSyrk(layout, triangle, a_transpose, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, beta, Buffer(c_buffer), c_offset, c_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syrk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Rank-K update of a hermitian matrix: CHERK/ZHERK template StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xherk,T>(queue_cpp, event); routine.DoHerk(layout, triangle, a_transpose, n, k, alpha, Buffer>(a_buffer), a_offset, a_ld, beta, Buffer>(c_buffer), c_offset, c_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Herk(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Rank-2K update of a symmetric matrix: SSYR2K/DSYR2K/CSYR2K/ZSYR2K/HSYR2K template StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const T beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xsyr2k(queue_cpp, event); routine.DoSyr2k(layout, triangle, ab_transpose, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld, beta, Buffer(c_buffer), c_offset, c_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double2, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Syr2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const half, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Rank-2K update of a hermitian matrix: CHER2K/ZHER2K template StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose, const size_t n, const size_t k, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const U beta, cl_mem c_buffer, const size_t c_offset, const size_t c_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = 
Xher2k(queue_cpp, event); routine.DoHer2k(layout, triangle, ab_transpose, n, k, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld, beta, Buffer(c_buffer), c_offset, c_ld); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const float, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Her2k(const Layout, const Triangle, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, const cl_mem, const size_t, const size_t, const double, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Triangular matrix-matrix multiplication: STRMM/DTRMM/CTRMM/ZTRMM/HTRMM template StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtrmm(queue_cpp, event); routine.DoTrmm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld); return StatusCode::kSuccess; } catch (...) 
{ return DispatchException(); } } template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trmm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Solves a triangular system of equations: STRSM/DTRSM/CTRSM/ZTRSM template StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xtrsm(queue_cpp, event); routine.DoTrsm(layout, side, triangle, a_transpose, diagonal, m, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld); return 
StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Trsm(const Layout, const Side, const Triangle, const Transpose, const Diagonal, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // ================================================================================================= // Extra non-BLAS routines (level-X) // ================================================================================================= // Scaling and out-place transpose/copy (non-BLAS function): SOMATCOPY/DOMATCOPY/COMATCOPY/ZOMATCOPY/HOMATCOPY template StatusCode Omatcopy(const Layout layout, const Transpose a_transpose, const size_t m, const size_t n, const T alpha, const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_mem b_buffer, const size_t b_offset, const size_t b_ld, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = Xomatcopy(queue_cpp, event); routine.DoOmatcopy(layout, a_transpose, m, n, alpha, Buffer(a_buffer), a_offset, a_ld, Buffer(b_buffer), b_offset, b_ld); return StatusCode::kSuccess; } 
catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const float, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const double, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const float2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const double2, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API Omatcopy(const Layout, const Transpose, const size_t, const size_t, const half, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); // Batched version of AXPY: SAXPYBATCHED/DAXPYBATCHED/CAXPYBATCHED/ZAXPYBATCHED/HAXPYBATCHED template StatusCode AxpyBatched(const size_t n, const T *alphas, const cl_mem *x_buffers, const size_t x_offset, const size_t x_inc, cl_mem *y_buffers, const size_t y_offset, const size_t y_inc, const size_t batch_count, cl_command_queue* queue, cl_event* event) { try { auto queue_cpp = Queue(*queue); auto routine = XaxpyBatched(queue_cpp, event); auto alphas_cpp = std::vector(); auto x_buffers_cpp = std::vector>(); auto y_buffers_cpp = std::vector>(); for (auto batch = size_t{0}; batch < batch_count; ++batch) { alphas_cpp.push_back(alphas[batch]); x_buffers_cpp.push_back(Buffer(x_buffers[batch])); y_buffers_cpp.push_back(Buffer(y_buffers[batch])); } routine.DoAxpyBatched(n, alphas_cpp, x_buffers_cpp, x_offset, x_inc, y_buffers_cpp, y_offset, 
y_inc, batch_count); return StatusCode::kSuccess; } catch (...) { return DispatchException(); } } template StatusCode PUBLIC_API AxpyBatched(const size_t, const float*, const cl_mem*, const size_t, const size_t, cl_mem*, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched(const size_t, const double*, const cl_mem*, const size_t, const size_t, cl_mem*, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched(const size_t, const float2*, const cl_mem*, const size_t, const size_t, cl_mem*, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched(const size_t, const double2*, const cl_mem*, const size_t, const size_t, cl_mem*, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*); template StatusCode PUBLIC_API AxpyBatched(const size_t, const half*, const cl_mem*, const size_t, const size_t, cl_mem*, const size_t, const size_t, const size_t, cl_command_queue*, cl_event*); // ================================================================================================= // Clears the cache of stored binaries StatusCode ClearCache() { try { ProgramCache::Instance().Invalidate(); BinaryCache::Instance().Invalidate(); } catch (...) 
{ return DispatchException(); } return StatusCode::kSuccess; } template void FillCacheForPrecision(Queue &queue) { try { // Runs all the level 1 set-up functions Xswap(queue, nullptr); Xswap(queue, nullptr); Xswap(queue, nullptr); Xswap(queue, nullptr); Xscal(queue, nullptr); Xscal(queue, nullptr); Xcopy(queue, nullptr); Xcopy(queue, nullptr); Xaxpy(queue, nullptr); Xaxpy(queue, nullptr); Xdot(queue, nullptr); Xdotu(queue, nullptr); Xdotc(queue, nullptr); Xnrm2(queue, nullptr); Xnrm2(queue, nullptr); Xasum(queue, nullptr); Xasum(queue, nullptr); Xsum(queue, nullptr); Xsum(queue, nullptr); Xamax(queue, nullptr); Xamax(queue, nullptr); Xmax(queue, nullptr); Xmax(queue, nullptr); Xmin(queue, nullptr); Xmin(queue, nullptr); // Runs all the level 2 set-up functions Xgemv(queue, nullptr); Xgemv(queue, nullptr); Xgbmv(queue, nullptr); Xgbmv(queue, nullptr); Xhemv(queue, nullptr); Xhbmv(queue, nullptr); Xhpmv(queue, nullptr); Xsymv(queue, nullptr); Xsbmv(queue, nullptr); Xspmv(queue, nullptr); Xtrmv(queue, nullptr); Xtrmv(queue, nullptr); Xtbmv(queue, nullptr); Xtbmv(queue, nullptr); Xtpmv(queue, nullptr); Xtpmv(queue, nullptr); Xger(queue, nullptr); Xgeru(queue, nullptr); Xgerc(queue, nullptr); Xher(queue, nullptr); Xhpr(queue, nullptr); Xher2(queue, nullptr); Xhpr2(queue, nullptr); Xsyr(queue, nullptr); Xspr(queue, nullptr); Xsyr2(queue, nullptr); Xspr2(queue, nullptr); // Runs all the level 3 set-up functions Xgemm(queue, nullptr); Xgemm(queue, nullptr); Xsymm(queue, nullptr); Xsymm(queue, nullptr); Xhemm(queue, nullptr); Xsyrk(queue, nullptr); Xsyrk(queue, nullptr); Xherk(queue, nullptr); Xsyr2k(queue, nullptr); Xsyr2k(queue, nullptr); Xher2k(queue, nullptr); Xtrmm(queue, nullptr); Xtrmm(queue, nullptr); // Runs all the non-BLAS set-up functions Xomatcopy(queue, nullptr); Xomatcopy(queue, nullptr); } catch(const RuntimeErrorCode &e) { if (e.status() != StatusCode::kNoDoublePrecision && e.status() != StatusCode::kNoHalfPrecision) { throw; } } } // Fills the cache with 
all binaries for a specific device // TODO: Add half-precision FP16 set-up calls StatusCode FillCache(const cl_device_id device) { try { // Creates a sample context and queue to match the normal routine calling conventions auto device_cpp = Device(device); auto context = Context(device_cpp); auto queue = Queue(context, device_cpp); FillCacheForPrecision(queue); FillCacheForPrecision(queue); } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } // ================================================================================================= // Overrides the tuning parameters for this device-precision-kernel combination StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name, const Precision precision, const std::unordered_map ¶meters) { try { // Retrieves the device name const auto device_cpp = Device(device); const auto device_name = device_cpp.Name(); // Retrieves the current database values to verify whether the new ones are complete auto in_cache = false; const auto current_database = DatabaseCache::Instance().Get(DatabaseKeyRef{ precision, device_name, kernel_name }, &in_cache); if (!in_cache) { return StatusCode::kInvalidOverrideKernel; } for (const auto ¤t_param : current_database.GetParameterNames()) { if (parameters.find(current_param) == parameters.end()) { return StatusCode::kMissingOverrideParameter; } } // Clears the existing program & binary cache for routines with the target kernel const auto routine_names = Routine::routines_by_kernel.at(kernel_name); for (const auto &routine_name : routine_names) { ProgramCache::Instance().RemoveBySubset<1, 2>(ProgramKey{nullptr, precision, routine_name}); BinaryCache::Instance().Remove(BinaryKey{precision, routine_name, device_name}); } // Creates a small custom database based on the provided parameters const auto database_device = Database::DatabaseDevice{"default", parameters}; const auto database_vendor = Database::DatabaseVendor{database::kDeviceTypeAll, 
"default", {database_device}}; const auto database_entry = Database::DatabaseEntry{kernel_name, precision, {database_vendor}}; const auto database = Database(device_cpp, kernel_name, precision, {&database_entry}); // Removes the old database entry and stores the new one in the cache DatabaseCache::Instance().Remove(DatabaseKey{ precision, device_name, kernel_name }); DatabaseCache::Instance().Store(DatabaseKey{ precision, device_name, kernel_name }, Database(database)); } catch (...) { return DispatchException(); } return StatusCode::kSuccess; } // ================================================================================================= } // namespace clblast