summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG 2
-rw-r--r-- doc/clblast.md 33
-rw-r--r-- include/clblast.h 10
-rw-r--r-- include/clblast_c.h 43
-rw-r--r-- scripts/generator/generator.py 22
-rw-r--r-- src/clblast.cc 43
-rw-r--r-- src/clblast_c.cc 80
-rw-r--r-- src/routines/level2/xger.cc 2
-rw-r--r-- src/routines/level2/xher.cc 3
-rw-r--r-- src/routines/level2/xher2.cc 2
-rw-r--r-- src/routines/level2/xspr.cc 1
-rw-r--r-- src/routines/level2/xspr2.cc 1
-rw-r--r-- src/routines/level2/xsyr.cc 1
-rw-r--r-- src/routines/level2/xsyr2.cc 1
14 files changed, 217 insertions, 27 deletions
diff --git a/CHANGELOG b/CHANGELOG
index aba3d3f7..328f044a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,7 +3,7 @@ Development version (next release)
- Added support for half-precision floating-point (fp16) in the library
- Added half-precision routines:
* Level-1: HSWAP/HSCAL/HCOPY/HAXPY/HDOT/HNRM2/HASUM/HSUM/iHAMAX/iHMAX/iHMIN
- * Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV
+ * Level-2: HGEMV/HGBMV/HHEMV/HHBMV/HHPMV/HSYMV/HSBMV/HSPMV/HTRMV/HTBMV/HTPMV/HGER/HSYR/HSPR/HSYR2/HSPR2
Version 0.7.1
- Improved performance of large power-of-2 xGEMM kernels for AMD GPUs
diff --git a/doc/clblast.md b/doc/clblast.md
index 91efd5fd..6f3f09c2 100644
--- a/doc/clblast.md
+++ b/doc/clblast.md
@@ -1446,6 +1446,13 @@ StatusCode CLBlastDger(const Layout layout,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHger(const Layout layout,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to GER:
@@ -1814,6 +1821,12 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SYR:
@@ -1863,6 +1876,12 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SPR:
@@ -1914,6 +1933,13 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SYR2:
@@ -1969,6 +1995,13 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event)
+StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event)
```
Arguments to SPR2:
diff --git a/include/clblast.h b/include/clblast.h
index f0742614..d7b952ba 100644
--- a/include/clblast.h
+++ b/include/clblast.h
@@ -356,7 +356,7 @@ StatusCode Tpsv(const Layout layout, const Triangle triangle, const Transpose a_
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event = nullptr);
-// General rank-1 matrix update: SGER/DGER
+// General rank-1 matrix update: SGER/DGER/HGER
template <typename T>
StatusCode Ger(const Layout layout,
const size_t m, const size_t n,
@@ -424,7 +424,7 @@ StatusCode Hpr2(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event = nullptr);
-// Symmetric rank-1 matrix update: SSYR/DSYR
+// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle,
const size_t n,
@@ -433,7 +433,7 @@ StatusCode Syr(const Layout layout, const Triangle triangle,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event = nullptr);
-// Symmetric packed rank-1 matrix update: SSPR/DSPR
+// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle,
const size_t n,
@@ -442,7 +442,7 @@ StatusCode Spr(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event = nullptr);
-// Symmetric rank-2 matrix update: SSYR2/DSYR2
+// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle,
const size_t n,
@@ -452,7 +452,7 @@ StatusCode Syr2(const Layout layout, const Triangle triangle,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event = nullptr);
-// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle,
const size_t n,
diff --git a/include/clblast_c.h b/include/clblast_c.h
index d0b89e19..92392921 100644
--- a/include/clblast_c.h
+++ b/include/clblast_c.h
@@ -781,7 +781,7 @@ StatusCode PUBLIC_API CLBlastZtpsv(const Layout layout, const Triangle triangle,
cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_command_queue* queue, cl_event* event);
-// General rank-1 matrix update: SGER/DGER
+// General rank-1 matrix update: SGER/DGER/HGER
StatusCode PUBLIC_API CLBlastSger(const Layout layout,
const size_t m, const size_t n,
const float alpha,
@@ -796,6 +796,13 @@ StatusCode PUBLIC_API CLBlastDger(const Layout layout,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHger(const Layout layout,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
// General rank-1 complex matrix update: CGERU/ZGERU
StatusCode PUBLIC_API CLBlastCgeru(const Layout layout,
@@ -889,7 +896,7 @@ StatusCode PUBLIC_API CLBlastZhpr2(const Layout layout, const Triangle triangle,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
-// Symmetric rank-1 matrix update: SSYR/DSYR
+// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
StatusCode PUBLIC_API CLBlastSsyr(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@@ -902,8 +909,14 @@ StatusCode PUBLIC_API CLBlastDsyr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsyr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
-// Symmetric packed rank-1 matrix update: SSPR/DSPR
+// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
StatusCode PUBLIC_API CLBlastSspr(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@@ -916,8 +929,14 @@ StatusCode PUBLIC_API CLBlastDspr(const Layout layout, const Triangle triangle,
const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHspr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
-// Symmetric rank-2 matrix update: SSYR2/DSYR2
+// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
StatusCode PUBLIC_API CLBlastSsyr2(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@@ -932,8 +951,15 @@ StatusCode PUBLIC_API CLBlastDsyr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHsyr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event);
-// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
StatusCode PUBLIC_API CLBlastSspr2(const Layout layout, const Triangle triangle,
const size_t n,
const float alpha,
@@ -948,6 +974,13 @@ StatusCode PUBLIC_API CLBlastDspr2(const Layout layout, const Triangle triangle,
const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
cl_mem ap_buffer, const size_t ap_offset,
cl_command_queue* queue, cl_event* event);
+StatusCode PUBLIC_API CLBlastHspr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 6881949b..8dd1f77a 100644
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -96,17 +96,17 @@ routines = [
Routine(False, True, "2a", "tbsv", T, [S,D,C,Z], ["n","k"], ["layout","triangle","a_transpose","diagonal"], ["a"], ["x"], [], "", "Solves a banded triangular system of equations", "", []),
Routine(False, True, "2a", "tpsv", T, [S,D,C,Z], ["n"], ["layout","triangle","a_transpose","diagonal"], ["ap"], ["x"], [], "", "Solves a packed triangular system of equations", "", []),
# Level 2: matrix update
- Routine(True, True, "2b", "ger", T, [S,D], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
- Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
- Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
- Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
- Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
- Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
- Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
- Routine(True, True, "2b", "syr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
- Routine(True, True, "2b", "spr", T, [S,D], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
- Routine(True, True, "2b", "syr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
- Routine(True, True, "2b", "spr2", T, [S,D], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
+ Routine(True, True, "2b", "ger", T, [S,D,H], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 matrix update", "", []),
+ Routine(True, True, "2b", "geru", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex matrix update", "", []),
+ Routine(True, True, "2b", "gerc", T, [C,Z], ["m","n"], ["layout"], ["x","y"], ["a"], ["alpha"], "", "General rank-1 complex conjugated matrix update", "", []),
+ Routine(True, True, "2b", "her", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Hermitian rank-1 matrix update", "", []),
+ Routine(True, True, "2b", "hpr", Tc, [Css,Zdd], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Hermitian packed rank-1 matrix update", "", []),
+ Routine(True, True, "2b", "her2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Hermitian rank-2 matrix update", "", []),
+ Routine(True, True, "2b", "hpr2", T, [C,Z], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Hermitian packed rank-2 matrix update", "", []),
+ Routine(True, True, "2b", "syr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["a"], ["alpha"], "", "Symmetric rank-1 matrix update", "", []),
+ Routine(True, True, "2b", "spr", T, [S,D,H], ["n"], ["layout","triangle"], ["x"], ["ap"], ["alpha"], "", "Symmetric packed rank-1 matrix update", "", []),
+ Routine(True, True, "2b", "syr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["a"], ["alpha"], "", "Symmetric rank-2 matrix update", "", []),
+ Routine(True, True, "2b", "spr2", T, [S,D,H], ["n"], ["layout","triangle"], ["x","y"], ["ap"], ["alpha"], "", "Symmetric packed rank-2 matrix update", "", []),
],
[ # Level 3: matrix-matrix
Routine(True, True, "3", "gemm", T, [S,D,C,Z], ["m","n","k"], ["layout","a_transpose","b_transpose"], ["a","b"], ["c"], ["alpha","beta"], "", "General matrix-matrix multiplication", "", []),
diff --git a/src/clblast.cc b/src/clblast.cc
index e89b41e8..449c7321 100644
--- a/src/clblast.cc
+++ b/src/clblast.cc
@@ -1207,7 +1207,7 @@ template StatusCode PUBLIC_API Tpsv<double2>(const Layout, const Triangle, const
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
-// General rank-1 matrix update: SGER/DGER
+// General rank-1 matrix update: SGER/DGER/HGER
template <typename T>
StatusCode Ger(const Layout layout,
const size_t m, const size_t n,
@@ -1241,6 +1241,13 @@ template StatusCode PUBLIC_API Ger<double>(const Layout,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Ger<half>(const Layout,
+ const size_t, const size_t,
+ const half,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
// General rank-1 complex matrix update: CGERU/ZGERU
template <typename T>
@@ -1444,7 +1451,7 @@ template StatusCode PUBLIC_API Hpr2<double2>(const Layout, const Triangle,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
-// Symmetric rank-1 matrix update: SSYR/DSYR
+// Symmetric rank-1 matrix update: SSYR/DSYR/HSYR
template <typename T>
StatusCode Syr(const Layout layout, const Triangle triangle,
const size_t n,
@@ -1474,8 +1481,14 @@ template StatusCode PUBLIC_API Syr<double>(const Layout, const Triangle,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Syr<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
-// Symmetric packed rank-1 matrix update: SSPR/DSPR
+// Symmetric packed rank-1 matrix update: SSPR/DSPR/HSPR
template <typename T>
StatusCode Spr(const Layout layout, const Triangle triangle,
const size_t n,
@@ -1505,8 +1518,14 @@ template StatusCode PUBLIC_API Spr<double>(const Layout, const Triangle,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Spr<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t,
+ cl_command_queue*, cl_event*);
-// Symmetric rank-2 matrix update: SSYR2/DSYR2
+// Symmetric rank-2 matrix update: SSYR2/DSYR2/HSYR2
template <typename T>
StatusCode Syr2(const Layout layout, const Triangle triangle,
const size_t n,
@@ -1540,8 +1559,15 @@ template StatusCode PUBLIC_API Syr2<double>(const Layout, const Triangle,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t, const size_t,
cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Syr2<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t, const size_t,
+ cl_command_queue*, cl_event*);
-// Symmetric packed rank-2 matrix update: SSPR2/DSPR2
+// Symmetric packed rank-2 matrix update: SSPR2/DSPR2/HSPR2
template <typename T>
StatusCode Spr2(const Layout layout, const Triangle triangle,
const size_t n,
@@ -1575,6 +1601,13 @@ template StatusCode PUBLIC_API Spr2<double>(const Layout, const Triangle,
const cl_mem, const size_t, const size_t,
cl_mem, const size_t,
cl_command_queue*, cl_event*);
+template StatusCode PUBLIC_API Spr2<half>(const Layout, const Triangle,
+ const size_t,
+ const half,
+ const cl_mem, const size_t, const size_t,
+ const cl_mem, const size_t, const size_t,
+ cl_mem, const size_t,
+ cl_command_queue*, cl_event*);
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
diff --git a/src/clblast_c.cc b/src/clblast_c.cc
index f1a81be5..c368a03c 100644
--- a/src/clblast_c.cc
+++ b/src/clblast_c.cc
@@ -1702,6 +1702,22 @@ StatusCode CLBlastDger(const Layout layout,
queue, event);
return static_cast<StatusCode>(status);
}
+StatusCode CLBlastHger(const Layout layout,
+ const size_t m, const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Ger(static_cast<clblast::Layout>(layout),
+ m, n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
// GERU
StatusCode CLBlastCgeru(const Layout layout,
@@ -1938,6 +1954,21 @@ StatusCode CLBlastDsyr(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
+StatusCode CLBlastHsyr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Syr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
// SPR
StatusCode CLBlastSspr(const Layout layout, const Triangle triangle,
@@ -1970,6 +2001,21 @@ StatusCode CLBlastDspr(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
+StatusCode CLBlastHspr(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Spr(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ ap_buffer, ap_offset,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
// SYR2
StatusCode CLBlastSsyr2(const Layout layout, const Triangle triangle,
@@ -2006,6 +2052,23 @@ StatusCode CLBlastDsyr2(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
+StatusCode CLBlastHsyr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Syr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ a_buffer, a_offset, a_ld,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
// SPR2
StatusCode CLBlastSspr2(const Layout layout, const Triangle triangle,
@@ -2042,6 +2105,23 @@ StatusCode CLBlastDspr2(const Layout layout, const Triangle triangle,
queue, event);
return static_cast<StatusCode>(status);
}
+StatusCode CLBlastHspr2(const Layout layout, const Triangle triangle,
+ const size_t n,
+ const cl_half alpha,
+ const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+ const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+ cl_mem ap_buffer, const size_t ap_offset,
+ cl_command_queue* queue, cl_event* event) {
+ auto status = clblast::Spr2(static_cast<clblast::Layout>(layout),
+ static_cast<clblast::Triangle>(triangle),
+ n,
+ alpha,
+ x_buffer, x_offset, x_inc,
+ y_buffer, y_offset, y_inc,
+ ap_buffer, ap_offset,
+ queue, event);
+ return static_cast<StatusCode>(status);
+}
// =================================================================================================
// BLAS level-3 (matrix-matrix) routines
diff --git a/src/routines/level2/xger.cc b/src/routines/level2/xger.cc
index 47d7abe2..d1f98990 100644
--- a/src/routines/level2/xger.cc
+++ b/src/routines/level2/xger.cc
@@ -20,6 +20,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xger<half>::precision_ = Precision::kHalf;
template <> const Precision Xger<float>::precision_ = Precision::kSingle;
template <> const Precision Xger<double>::precision_ = Precision::kDouble;
template <> const Precision Xger<float2>::precision_ = Precision::kComplexSingle;
@@ -104,6 +105,7 @@ StatusCode Xger<T>::DoGer(const Layout layout,
// =================================================================================================
// Compiles the templated class
+template class Xger<half>;
template class Xger<float>;
template class Xger<double>;
template class Xger<float2>;
diff --git a/src/routines/level2/xher.cc b/src/routines/level2/xher.cc
index 852e3f15..73e7a47d 100644
--- a/src/routines/level2/xher.cc
+++ b/src/routines/level2/xher.cc
@@ -19,6 +19,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xher<half, half>::precision_ = Precision::kHalf;
template <> const Precision Xher<float, float>::precision_ = Precision::kSingle;
template <> const Precision Xher<double, double>::precision_ = Precision::kDouble;
template <> const Precision Xher<float2, float>::precision_ = Precision::kComplexSingle;
@@ -43,6 +44,7 @@ template <> float2 Xher<float2,float>::GetAlpha(const float alpha) { return floa
template <> double2 Xher<double2,double>::GetAlpha(const double alpha) { return double2{alpha, 0.0}; }
template <> float Xher<float,float>::GetAlpha(const float alpha) { return alpha; }
template <> double Xher<double,double>::GetAlpha(const double alpha) { return alpha; }
+template <> half Xher<half,half>::GetAlpha(const half alpha) { return alpha; }
// =================================================================================================
@@ -114,6 +116,7 @@ StatusCode Xher<T,U>::DoHer(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
+template class Xher<half, half>;
template class Xher<float, float>;
template class Xher<double, double>;
template class Xher<float2, float>;
diff --git a/src/routines/level2/xher2.cc b/src/routines/level2/xher2.cc
index 82052187..a73dde52 100644
--- a/src/routines/level2/xher2.cc
+++ b/src/routines/level2/xher2.cc
@@ -19,6 +19,7 @@ namespace clblast {
// =================================================================================================
// Specific implementations to get the memory-type based on a template argument
+template <> const Precision Xher2<half>::precision_ = Precision::kHalf;
template <> const Precision Xher2<float>::precision_ = Precision::kSingle;
template <> const Precision Xher2<double>::precision_ = Precision::kDouble;
template <> const Precision Xher2<float2>::precision_ = Precision::kComplexSingle;
@@ -106,6 +107,7 @@ StatusCode Xher2<T>::DoHer2(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
+template class Xher2<half>;
template class Xher2<float>;
template class Xher2<double>;
template class Xher2<float2>;
diff --git a/src/routines/level2/xspr.cc b/src/routines/level2/xspr.cc
index 55af2f29..c556b920 100644
--- a/src/routines/level2/xspr.cc
+++ b/src/routines/level2/xspr.cc
@@ -44,6 +44,7 @@ StatusCode Xspr<T>::DoSpr(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
+template class Xspr<half>;
template class Xspr<float>;
template class Xspr<double>;
diff --git a/src/routines/level2/xspr2.cc b/src/routines/level2/xspr2.cc
index 9a3f97ce..c4ad5dc4 100644
--- a/src/routines/level2/xspr2.cc
+++ b/src/routines/level2/xspr2.cc
@@ -46,6 +46,7 @@ StatusCode Xspr2<T>::DoSpr2(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
+template class Xspr2<half>;
template class Xspr2<float>;
template class Xspr2<double>;
diff --git a/src/routines/level2/xsyr.cc b/src/routines/level2/xsyr.cc
index 4b3928e5..892517d7 100644
--- a/src/routines/level2/xsyr.cc
+++ b/src/routines/level2/xsyr.cc
@@ -43,6 +43,7 @@ StatusCode Xsyr<T>::DoSyr(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
+template class Xsyr<half>;
template class Xsyr<float>;
template class Xsyr<double>;
diff --git a/src/routines/level2/xsyr2.cc b/src/routines/level2/xsyr2.cc
index 3ae389e0..e6dfd158 100644
--- a/src/routines/level2/xsyr2.cc
+++ b/src/routines/level2/xsyr2.cc
@@ -45,6 +45,7 @@ StatusCode Xsyr2<T>::DoSyr2(const Layout layout, const Triangle triangle,
// =================================================================================================
// Compiles the templated class
+template class Xsyr2<half>;
template class Xsyr2<float>;
template class Xsyr2<double>;