diff options
-rw-r--r-- | CHANGELOG | 2 | ||||
-rw-r--r-- | doc/clblast.md | 33 | ||||
-rw-r--r-- | include/clblast.h | 10 | ||||
-rw-r--r-- | include/clblast_c.h | 43 | ||||
-rw-r--r-- | scripts/generator/generator.py | 22 | ||||
-rw-r--r-- | src/clblast.cc | 43 | ||||
-rw-r--r-- | src/clblast_c.cc | 80 | ||||
-rw-r--r-- | src/routines/level2/xger.cc | 2 | ||||
-rw-r--r-- | src/routines/level2/xher.cc | 3 | ||||
-rw-r--r-- | src/routines/level2/xher2.cc | 2 | ||||
-rw-r--r-- | src/routines/level2/xspr.cc | 1 | ||||
-rw-r--r-- | src/routines/level2/xspr2.cc | 1 | ||||
-rw-r--r-- | src/routines/level2/xsyr.cc | 1 | ||||
-rw-r--r-- | src/routines/level2/xsyr2.cc | 1 |
14 files changed, 217 insertions, 27 deletions
@@ -3,7 +3,7 @@ Development version (next release) - Added support for half-precision floating-point (fp16) in the library - Added half-precision routines: * Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN - * Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV + * Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2 Version 0.7.1 - Improved performance of large power-of-2 xGEMM kernels for AMD GPUs diff --git a/doc/clblast.md b/doc/clblast.md index 91efd5fd..6f3f09c2 100644 --- a/doc/clblast.md +++ b/doc/clblast.md @@ -1446,6 +1446,13 @@ StatusCode CLBlastDger(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHger(const Layout layout, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to GER: @@ -1814,6 +1821,12 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to SYR: @@ -1863,6 +1876,12 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHspr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) ``` Arguments to SPR: @@ -1914,6 +1933,13 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) ``` Arguments to SYR2: @@ -1969,6 +1995,13 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event) +StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) ``` Arguments to SPR2: diff --git a/include/clblast.h b/include/clblast.h index f0742614..d7b952ba 100644 --- a/include/clblast.h +++ b/include/clblast.h @@ -356,7 +356,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_ cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event = nullptr); -// General rank-1 matrix update: SGER/DGER +// General rank-1 matrix update: SGER/DGER/HGER template <typename T> StatusCode Ger(const Layout layout, const size_t m, const size_t n, @@ -424,7 +424,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric rank-1 matrix update: SSYR/DSYR +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template <typename T> StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, @@ -433,7 +433,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric packed rank-1 matrix update: SSPR/DSPR +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template <typename T> StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, @@ -442,7 +442,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric rank-2 matrix update: SSYR2/DSYR2 +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template <typename T> StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, @@ -452,7 +452,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event = nullptr); -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template <typename T> StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, diff --git a/include/clblast_c.h b/include/clblast_c.h index d0b89e19..92392921 100644 --- a/include/clblast_c.h +++ b/include/clblast_c.h @@ -781,7 +781,7 @@ StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle, cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_command_queue* queue, cl_event* event); -// General rank-1 matrix update: SGER/DGER +// General rank-1 matrix update: SGER/DGER/HGER StatusCode PUBLIC_API CLBlastSger(const Layout layout, const size_t m, const size_t n, const float alpha, @@ -796,6 +796,13 @@ StatusCode PUBLIC_API CLBlastDger(const Layout layout, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHger(const Layout layout, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); // General rank-1 complex matrix update: CGERU/ZGERU StatusCode PUBLIC_API CLBlastCgeru(const Layout layout, @@ -889,7 +896,7 @@ StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); -// Symmetric rank-1 matrix update: SSYR/DSYR +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -902,8 +909,14 @@ StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsyr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); -// Symmetric packed rank-1 matrix update: SSPR/DSPR +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -916,8 +929,14 @@ StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle, const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHspr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); -// Symmetric rank-2 matrix update: SSYR2/DSYR2 +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -932,8 +951,15 @@ StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem a_buffer, const size_t a_offset, const size_t a_ld, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event); -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle, const size_t n, const float alpha, @@ -948,6 +974,13 @@ StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle, const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, cl_mem ap_buffer, const size_t ap_offset, cl_command_queue* queue, cl_event* event); +StatusCode PUBLIC_API CLBlastHspr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 6881949b..8dd1f77a 100644 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -96,17 +96,17 @@ routines = [ Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []), Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []), # Level 2: matrix update - Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []), - Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []), - Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []), - Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []), - Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []), - Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []), - Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []), - Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []), - Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []), - Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []), - Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []), + Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []), + Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []), + Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []), + Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []), + Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []), + Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []), + Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []), + Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []), + Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []), + Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []), + Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []), ], [ # Level 3: matrix-matrix Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []), diff --git a/src/clblast.cc b/src/clblast.cc index e89b41e8..449c7321 100644 --- a/src/clblast.cc +++ b/src/clblast.cc @@ -1207,7 +1207,7 @@ template StatusCode PUBLIC_API Tpsv<double2>(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); -// General rank-1 matrix update: SGER/DGER +// General rank-1 matrix update: SGER/DGER/HGER template <typename T> StatusCode Ger(const Layout layout, const size_t m, const size_t n, @@ -1241,6 +1241,13 @@ template StatusCode PUBLIC_API Ger<double>(const Layout, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Ger<half>(const Layout, + const size_t, const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); // General rank-1 complex matrix update: CGERU/ZGERU template <typename T> @@ -1444,7 +1451,7 @@ template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle, cl_mem, const size_t, cl_command_queue*, cl_event*); -// Symmetric rank-1 matrix update: SSYR/DSYR +// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR template <typename T> StatusCode Syr(const Layout layout, const Triangle triangle, const size_t n, @@ -1474,8 +1481,14 @@ template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Symmetric packed rank-1 matrix update: SSPR/DSPR +// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR template <typename T> StatusCode Spr(const Layout layout, const Triangle triangle, const size_t n, @@ -1505,8 +1518,14 @@ template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); -// Symmetric rank-2 matrix update: SSYR2/DSYR2 +// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2 template <typename T> StatusCode Syr2(const Layout layout, const Triangle triangle, const size_t n, @@ -1540,8 +1559,15 @@ template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_mem, const size_t, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, const size_t, + cl_command_queue*, cl_event*); -// Symmetric packed rank-2 matrix update: SSPR2/DSPR2 +// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2 template <typename T> StatusCode Spr2(const Layout layout, const Triangle triangle, const size_t n, @@ -1575,6 +1601,13 @@ template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle, const cl_mem, const size_t, const size_t, cl_mem, const size_t, cl_command_queue*, cl_event*); +template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle, + const size_t, + const half, + const cl_mem, const size_t, const size_t, + const cl_mem, const size_t, const size_t, + cl_mem, const size_t, + cl_command_queue*, cl_event*); // ================================================================================================= // BLAS level-3 (matrix-matrix) routines diff --git a/src/clblast_c.cc b/src/clblast_c.cc index f1a81be5..c368a03c 100644 --- a/src/clblast_c.cc +++ b/src/clblast_c.cc @@ -1702,6 +1702,22 @@ StatusCode CLBlastDger(const Layout layout, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHger(const Layout layout, + const size_t m, const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Ger(static_cast<clblast::Layout>(layout), + m, n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} // GERU StatusCode CLBlastCgeru(const Layout layout, @@ -1938,6 +1954,21 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} // SPR StatusCode CLBlastSspr(const Layout layout, const Triangle triangle, @@ -1970,6 +2001,21 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHspr(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} // SYR2 StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle, @@ -2006,6 +2052,23 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem a_buffer, const size_t a_offset, const size_t a_ld, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Syr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + a_buffer, a_offset, a_ld, + queue, event); + return static_cast<StatusCode>(status); +} // SPR2 StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle, @@ -2042,6 +2105,23 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle, queue, event); return static_cast<StatusCode>(status); } +StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle, + const size_t n, + const cl_half alpha, + const cl_mem x_buffer, const size_t x_offset, const size_t x_inc, + const cl_mem y_buffer, const size_t y_offset, const size_t y_inc, + cl_mem ap_buffer, const size_t ap_offset, + cl_command_queue* queue, cl_event* event) { + auto status = clblast::Spr2(static_cast<clblast::Layout>(layout), + static_cast<clblast::Triangle>(triangle), + n, + alpha, + x_buffer, x_offset, x_inc, + y_buffer, y_offset, y_inc, + ap_buffer, ap_offset, + queue, event); + return static_cast<StatusCode>(status); +} // ================================================================================================= // BLAS level-3 (matrix-matrix) routines diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc index 47d7abe2..d1f98990 100644 --- a/src/routines/level2/xger.cc +++ b/src/routines/level2/xger.cc @@ -20,6 +20,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xger<half>::precision_ = Precision::kHalf; template <> const Precision Xger<float>::precision_ = Precision::kSingle; template <> const Precision Xger<double>::precision_ = Precision::kDouble; template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle; @@ -104,6 +105,7 @@ StatusCode Xger<T>::DoGer(const Layout layout, // ================================================================================================= // Compiles the templated class +template class Xger<half>; template class Xger<float>; template class Xger<double>; template class Xger<float2>; diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc index 852e3f15..73e7a47d 100644 --- a/src/routines/level2/xher.cc +++ b/src/routines/level2/xher.cc @@ -19,6 +19,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xher<half, half>::precision_ = Precision::kHalf; template <> const Precision Xher<float, float>::precision_ = Precision::kSingle; template <> const Precision Xher<double, double>::precision_ = Precision::kDouble; template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle; @@ -43,6 +44,7 @@ template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return floa template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; } template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; } template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; } +template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; } // ================================================================================================= @@ -114,6 +116,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xher<half, half>; template class Xher<float, float>; template class Xher<double, double>; template class Xher<float2, float>; diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc index 82052187..a73dde52 100644 --- a/src/routines/level2/xher2.cc +++ b/src/routines/level2/xher2.cc @@ -19,6 +19,7 @@ namespace clblast { // ================================================================================================= // Specific implementations to get the memory-type based on a template argument +template <> const Precision Xher2<half>::precision_ = Precision::kHalf; template <> const Precision Xher2<float>::precision_ = Precision::kSingle; template <> const Precision Xher2<double>::precision_ = Precision::kDouble; template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle; @@ -106,6 +107,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xher2<half>; template class Xher2<float>; template class Xher2<double>; template class Xher2<float2>; diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc index 55af2f29..c556b920 100644 --- a/src/routines/level2/xspr.cc +++ b/src/routines/level2/xspr.cc @@ -44,6 +44,7 @@ StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xspr<half>; template class Xspr<float>; template class Xspr<double>; diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc index 9a3f97ce..c4ad5dc4 100644 --- a/src/routines/level2/xspr2.cc +++ b/src/routines/level2/xspr2.cc @@ -46,6 +46,7 @@ StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xspr2<half>; template class Xspr2<float>; template class Xspr2<double>; diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc index 4b3928e5..892517d7 100644 --- a/src/routines/level2/xsyr.cc +++ b/src/routines/level2/xsyr.cc @@ -43,6 +43,7 @@ StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xsyr<half>; template class Xsyr<float>; template class Xsyr<double>; diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc index 3ae389e0..e6dfd158 100644 --- a/src/routines/level2/xsyr2.cc +++ b/src/routines/level2/xsyr2.cc @@ -45,6 +45,7 @@ StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle, // ================================================================================================= // Compiles the templated class +template class Xsyr2<half>; template class Xsyr2<float>; template class Xsyr2<double>; |