11 files changed, 4391 insertions, 0 deletions
diff --git a/doc/api.md b/doc/api.md
new file mode 100644
index 00000000..996505f1
--- /dev/null
+++ b/doc/api.md
@@ -0,0 +1,3712 @@
+CLBlast: API reference
+================
+
+
+xSWAP: Swap two vectors
+-------------
+
+Interchanges _n_ elements of vectors _x_ and _y_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Swap(const size_t n,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSswap(const size_t n,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDswap(const size_t n,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCswap(const size_t n,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZswap(const size_t n,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHswap(const size_t n,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SWAP:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
+* `const size_t x_offset`: The offset in elements from the start of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xSCAL: Vector scaling
+-------------
+
+Multiplies _n_ elements of vector _x_ by a scalar constant _alpha_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Scal(const size_t n,
+                const T alpha,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSscal(const size_t n,
+                               const float alpha,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDscal(const size_t n,
+                               const double alpha,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCscal(const size_t n,
+                               const cl_float2 alpha,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZscal(const size_t n,
+                               const cl_double2 alpha,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHscal(const size_t n,
+                               const cl_half alpha,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SCAL:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
+* `const size_t x_offset`: The offset in elements from the start of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xCOPY: Vector copy
+-------------
+
+Copies the contents of vector _x_ into vector _y_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Copy(const size_t n,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastScopy(const size_t n,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDcopy(const size_t n,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCcopy(const size_t n,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZcopy(const size_t n,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHcopy(const size_t n,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to COPY:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xAXPY: Vector-times-constant plus vector
+-------------
+
+Performs the operation _y = alpha * x + y_, in which _x_ and _y_ are vectors and _alpha_ is a scalar constant.
+
+C++ API:
+```
+template <typename T>
+StatusCode Axpy(const size_t n,
+                const T alpha,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSaxpy(const size_t n,
+                               const float alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDaxpy(const size_t n,
+                               const double alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCaxpy(const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZaxpy(const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHaxpy(const size_t n,
+                               const cl_half alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to AXPY:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xDOT: Dot product of two vectors
+-------------
+
+Multiplies _n_ elements of the vectors _x_ and _y_ element-wise and accumulates the results. The sum is stored in the _dot_ buffer.
+
+C++ API:
+```
+template <typename T>
+StatusCode Dot(const size_t n,
+               cl_mem dot_buffer, const size_t dot_offset,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSdot(const size_t n,
+                              cl_mem dot_buffer, const size_t dot_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDdot(const size_t n,
+                              cl_mem dot_buffer, const size_t dot_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHdot(const size_t n,
+                              cl_mem dot_buffer, const size_t dot_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to DOT:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
+* `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xDOTU: Dot product of two complex vectors
+-------------
+
+See the regular xDOT routine.
+
+C++ API:
+```
+template <typename T>
+StatusCode Dotu(const size_t n,
+                cl_mem dot_buffer, const size_t dot_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastCdotu(const size_t n,
+                               cl_mem dot_buffer, const size_t dot_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZdotu(const size_t n,
+                               cl_mem dot_buffer, const size_t dot_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to DOTU:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
+* `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xDOTC: Dot product of two complex vectors, one conjugated
+-------------
+
+See the regular xDOT routine.
+
+C++ API:
+```
+template <typename T>
+StatusCode Dotc(const size_t n,
+                cl_mem dot_buffer, const size_t dot_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastCdotc(const size_t n,
+                               cl_mem dot_buffer, const size_t dot_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZdotc(const size_t n,
+                               cl_mem dot_buffer, const size_t dot_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to DOTC:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem dot_buffer`: OpenCL buffer to store the output dot vector.
+* `const size_t dot_offset`: The offset in elements from the start of the output dot vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xNRM2: Euclidean norm of a vector
+-------------
+
+Accumulates the squares of _n_ elements in the _x_ vector and takes the square root. The resulting L2 norm is stored in the _nrm2_ buffer.
+
+C++ API:
+```
+template <typename T>
+StatusCode Nrm2(const size_t n,
+                cl_mem nrm2_buffer, const size_t nrm2_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSnrm2(const size_t n,
+                               cl_mem nrm2_buffer, const size_t nrm2_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDnrm2(const size_t n,
+                               cl_mem nrm2_buffer, const size_t nrm2_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastScnrm2(const size_t n,
+                               cl_mem nrm2_buffer, const size_t nrm2_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDznrm2(const size_t n,
+                               cl_mem nrm2_buffer, const size_t nrm2_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHnrm2(const size_t n,
+                               cl_mem nrm2_buffer, const size_t nrm2_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to NRM2:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem nrm2_buffer`: OpenCL buffer to store the output nrm2 vector.
+* `const size_t nrm2_offset`: The offset in elements from the start of the output nrm2 vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xASUM: Absolute sum of values in a vector
+-------------
+
+Accumulates the absolute values of _n_ elements in the _x_ vector. The result is stored in the _asum_ buffer.
+
+C++ API:
+```
+template <typename T>
+StatusCode Asum(const size_t n,
+                cl_mem asum_buffer, const size_t asum_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSasum(const size_t n,
+                               cl_mem asum_buffer, const size_t asum_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDasum(const size_t n,
+                               cl_mem asum_buffer, const size_t asum_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastScasum(const size_t n,
+                               cl_mem asum_buffer, const size_t asum_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDzasum(const size_t n,
+                               cl_mem asum_buffer, const size_t asum_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHasum(const size_t n,
+                               cl_mem asum_buffer, const size_t asum_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to ASUM:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem asum_buffer`: OpenCL buffer to store the output asum vector.
+* `const size_t asum_offset`: The offset in elements from the start of the output asum vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xSUM: Sum of values in a vector (non-BLAS function)
+-------------
+
+Accumulates the values of _n_ elements in the _x_ vector. The result is stored in the _sum_ buffer. This routine is the non-absolute version of the xASUM BLAS routine.
+
+C++ API:
+```
+template <typename T>
+StatusCode Sum(const size_t n,
+               cl_mem sum_buffer, const size_t sum_offset,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSsum(const size_t n,
+                              cl_mem sum_buffer, const size_t sum_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsum(const size_t n,
+                              cl_mem sum_buffer, const size_t sum_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastScsum(const size_t n,
+                              cl_mem sum_buffer, const size_t sum_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDzsum(const size_t n,
+                              cl_mem sum_buffer, const size_t sum_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsum(const size_t n,
+                              cl_mem sum_buffer, const size_t sum_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SUM:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem sum_buffer`: OpenCL buffer to store the output sum vector.
+* `const size_t sum_offset`: The offset in elements from the start of the output sum vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xAMAX: Index of absolute maximum value in a vector
+-------------
+
+Finds the index of the maximum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer.
+
+C++ API:
+```
+template <typename T>
+StatusCode Amax(const size_t n,
+                cl_mem imax_buffer, const size_t imax_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastiSamax(const size_t n,
+                               cl_mem imax_buffer, const size_t imax_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiDamax(const size_t n,
+                               cl_mem imax_buffer, const size_t imax_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiCamax(const size_t n,
+                               cl_mem imax_buffer, const size_t imax_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiZamax(const size_t n,
+                               cl_mem imax_buffer, const size_t imax_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiHamax(const size_t n,
+                               cl_mem imax_buffer, const size_t imax_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to AMAX:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector.
+* `const size_t imax_offset`: The offset in elements from the start of the output imax vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xAMIN: Index of absolute minimum value in a vector (non-BLAS function)
+-------------
+
+Finds the index of the minimum of the absolute values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer.
+
+C++ API:
+```
+template <typename T>
+StatusCode Amin(const size_t n,
+                cl_mem imin_buffer, const size_t imin_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastiSamin(const size_t n,
+                               cl_mem imin_buffer, const size_t imin_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiDamin(const size_t n,
+                               cl_mem imin_buffer, const size_t imin_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiCamin(const size_t n,
+                               cl_mem imin_buffer, const size_t imin_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiZamin(const size_t n,
+                               cl_mem imin_buffer, const size_t imin_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiHamin(const size_t n,
+                               cl_mem imin_buffer, const size_t imin_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to AMIN:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector.
+* `const size_t imin_offset`: The offset in elements from the start of the output imin vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xMAX: Index of maximum value in a vector (non-BLAS function)
+-------------
+
+Finds the index of the maximum of the values in the _x_ vector. The resulting integer index is stored in the _imax_ buffer. This routine is the non-absolute version of the IxAMAX BLAS routine.
+
+C++ API:
+```
+template <typename T>
+StatusCode Max(const size_t n,
+               cl_mem imax_buffer, const size_t imax_offset,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastiSmax(const size_t n,
+                              cl_mem imax_buffer, const size_t imax_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiDmax(const size_t n,
+                              cl_mem imax_buffer, const size_t imax_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiCmax(const size_t n,
+                              cl_mem imax_buffer, const size_t imax_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiZmax(const size_t n,
+                              cl_mem imax_buffer, const size_t imax_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiHmax(const size_t n,
+                              cl_mem imax_buffer, const size_t imax_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to MAX:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem imax_buffer`: OpenCL buffer to store the output imax vector.
+* `const size_t imax_offset`: The offset in elements from the start of the output imax vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xMIN: Index of minimum value in a vector (non-BLAS function)
+-------------
+
+Finds the index of the minimum of the values in the _x_ vector. The resulting integer index is stored in the _imin_ buffer. This routine is the non-absolute, minimum-value counterpart of the IxAMAX BLAS routine.
+
+C++ API:
+```
+template <typename T>
+StatusCode Min(const size_t n,
+               cl_mem imin_buffer, const size_t imin_offset,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastiSmin(const size_t n,
+                              cl_mem imin_buffer, const size_t imin_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiDmin(const size_t n,
+                              cl_mem imin_buffer, const size_t imin_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiCmin(const size_t n,
+                              cl_mem imin_buffer, const size_t imin_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiZmin(const size_t n,
+                              cl_mem imin_buffer, const size_t imin_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastiHmin(const size_t n,
+                              cl_mem imin_buffer, const size_t imin_offset,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to MIN:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `cl_mem imin_buffer`: OpenCL buffer to store the output imin vector.
+* `const size_t imin_offset`: The offset in elements from the start of the output imin vector.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xGEMV: General matrix-vector multiplication
+-------------
+
+Performs the operation _y = alpha * A * x + beta * y_, in which _x_ is an input vector, _y_ is an input and output vector, _A_ is an input matrix, and _alpha_ and _beta_ are scalars. The matrix _A_ can optionally be transposed before performing the operation.
+
+C++ API:
+```
+template <typename T>
+StatusCode Gemv(const Layout layout, const Transpose a_transpose,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const float beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const double beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_float2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_double2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHgemv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_half beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to GEMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for GEMV:
+
+* The value of `a_ld` must be at least `m`.
+
+
+
+xGBMV: General banded matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is banded instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Gbmv(const Layout layout, const Transpose a_transpose,
+                const size_t m, const size_t n, const size_t kl, const size_t ku,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n, const size_t kl, const size_t ku,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const float beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n, const size_t kl, const size_t ku,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const double beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n, const size_t kl, const size_t ku,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_float2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n, const size_t kl, const size_t ku,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_double2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHgbmv(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                               const size_t m, const size_t n, const size_t kl, const size_t ku,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_half beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to GBMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t kl`: Integer size argument. This value must be positive.
+* `const size_t ku`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for GBMV:
+
+* The value of `a_ld` must be at least `kl + ku + 1`.
+
+
+
+xHEMV: Hermitian matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is a Hermitian matrix instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Hemv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastChemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_float2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhemv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_double2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HEMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for HEMV:
+
+* The value of `a_ld` must be at least `n`.
+
+
+
+xHBMV: Hermitian banded matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is a Hermitian banded matrix instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Hbmv(const Layout layout, const Triangle triangle,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastChbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n, const size_t k,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_float2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n, const size_t k,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_double2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HBMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for HBMV:
+
+* The value of `a_ld` must be at least `k + 1`.
+
+
+
+xHPMV: Hermitian packed matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is a Hermitian packed matrix instead, represented as _AP_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Hpmv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastChpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_float2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhpmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_double2 beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HPMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
+* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xSYMV: Symmetric matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is symmetric instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Symv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const float beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const double beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsymv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_half beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SYMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for SYMV:
+
+* The value of `a_ld` must be at least `n`.
+
+
+
+xSBMV: Symmetric banded matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is symmetric and banded instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Sbmv(const Layout layout, const Triangle triangle,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n, const size_t k,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const float beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n, const size_t k,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const double beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsbmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n, const size_t k,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_half beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SBMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The triangular part of the symmetric matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for SBMV:
+
+* The value of `a_ld` must be at least `k + 1`.
+
+
+
+xSPMV: Symmetric packed matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Spmv(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const T beta,
+                cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const float alpha,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const float beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const double alpha,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const double beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHspmv(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_half alpha,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_half beta,
+                               cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SPMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The triangular part of the symmetric matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
+* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t y_offset`: The offset in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xTRMV: Triangular matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is triangular instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Trmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastStrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtrmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to TRMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
+* `const size_t x_offset`: The offset in elements from the start of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for TRMV:
+
+* The value of `a_ld` must be at least `n`.
+
+
+
+xTBMV: Triangular banded matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is triangular and banded instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Tbmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n, const size_t k,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastStbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n, const size_t k,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n, const size_t k,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n, const size_t k,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n, const size_t k,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtbmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n, const size_t k,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to TBMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
+* `const size_t x_offset`: The offset in elements from the start of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for TBMV:
+
+* The value of `a_ld` must be at least `k + 1`.
+
+
+
+xTPMV: Triangular packed matrix-vector multiplication
+-------------
+
+Same operation as xGEMV, but matrix _A_ is a triangular packed matrix instead and represented as _AP_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Tpmv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem ap_buffer, const size_t ap_offset,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastStpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtpmv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem ap_buffer, const size_t ap_offset,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to TPMV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const cl_mem ap_buffer`: OpenCL buffer to store the input AP matrix.
+* `const size_t ap_offset`: The offset in elements from the start of the input AP matrix.
+* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
+* `const size_t x_offset`: The offset in elements from the start of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xTRSV: Solves a triangular system of equations
+-------------
+
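+Solves the triangular system of equations _A * x = b_ for _x_, in which _A_ is a triangular matrix. On entry, the vector _x_ holds the right-hand side _b_; on completion it is overwritten with the solution.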
+
+C++ API:
+```
+template <typename T>
+StatusCode Trsv(const Layout layout, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t n,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastStrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtrsv(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t n,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to TRSV:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal matrix, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem x_buffer`: OpenCL buffer to store the output x vector.
+* `const size_t x_offset`: The offset in elements from the start of the output x vector.
+* `const size_t x_inc`: Stride/increment of the output x vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xGER: General rank-1 matrix update
+-------------
+
+Performs the operation _A = alpha * x * y^T + A_, in which _x_ is an input vector, _y^T_ is the transpose of the input vector _y_, _A_ is the matrix to be updated, and _alpha_ is a scalar value.
+
+C++ API:
+```
+template <typename T>
+StatusCode Ger(const Layout layout,
+               const size_t m, const size_t n,
+               const T alpha,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSger(const CLBlastLayout layout,
+                              const size_t m, const size_t n,
+                              const float alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDger(const CLBlastLayout layout,
+                              const size_t m, const size_t n,
+                              const double alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHger(const CLBlastLayout layout,
+                              const size_t m, const size_t n,
+                              const cl_half alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to GER:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for GER:
+
+* The value of `a_ld` must be at least `m`.
+
+
+
+xGERU: General rank-1 complex matrix update
+-------------
+
+Same operation as xGER, but with complex data-types.
+
+C++ API:
+```
+template <typename T>
+StatusCode Geru(const Layout layout,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastCgeru(const CLBlastLayout layout,
+                               const size_t m, const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgeru(const CLBlastLayout layout,
+                               const size_t m, const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to GERU:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for GERU:
+
+* The value of `a_ld` must be at least `m`.
+
+
+
+xGERC: General rank-1 complex conjugated matrix update
+-------------
+
+Same operation as xGERU, but the update is done based on the complex conjugate of the input vectors.
+
+C++ API:
+```
+template <typename T>
+StatusCode Gerc(const Layout layout,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastCgerc(const CLBlastLayout layout,
+                               const size_t m, const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgerc(const CLBlastLayout layout,
+                               const size_t m, const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to GERC:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for GERC:
+
+* The value of `a_ld` must be at least `m`.
+
+
+
+xHER: Hermitian rank-1 matrix update
+-------------
+
+Performs the operation _A = alpha * x * x^T + A_, in which _x_ is an input vector, _x^T_ is the transpose of this vector, _A_ is the triangular Hermitian matrix to be updated, and _alpha_ is a scalar value.
+
+C++ API:
+```
+template <typename T>
+StatusCode Her(const Layout layout, const Triangle triangle,
+               const size_t n,
+               const T alpha,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastCher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const float alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZher(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const double alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HER:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for HER:
+
+* The value of `a_ld` must be at least `n`.
+
+
+
+xHPR: Hermitian packed rank-1 matrix update
+-------------
+
+Same operation as xHER, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Hpr(const Layout layout, const Triangle triangle,
+               const size_t n,
+               const T alpha,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_mem ap_buffer, const size_t ap_offset,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastChpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const float alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem ap_buffer, const size_t ap_offset,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhpr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const double alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem ap_buffer, const size_t ap_offset,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HPR:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
+* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xHER2: Hermitian rank-2 matrix update
+-------------
+
+Performs the operation _A = alpha * x * y^T + conj(alpha) * y * x^T + A_, in which _x_ is an input vector and _x^T_ its transpose, _y_ is an input vector and _y^T_ its transpose, _A_ is the triangular Hermitian matrix to be updated, _alpha_ is a scalar value and _conj(alpha)_ its complex conjugate.
+
+C++ API:
+```
+template <typename T>
+StatusCode Her2(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastCher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZher2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HER2:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for HER2:
+
+* The value of `a_ld` must be at least `n`.
+
+
+
+xHPR2: Hermitian packed rank-2 matrix update
+-------------
+
+Same operation as xHER2, but matrix _A_ is an Hermitian packed matrix instead and represented as _AP_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Hpr2(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_mem ap_buffer, const size_t ap_offset,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastChpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem ap_buffer, const size_t ap_offset,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhpr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem ap_buffer, const size_t ap_offset,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HPR2:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
+* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xSYR: Symmetric rank-1 matrix update
+-------------
+
+Same operation as xHER, but matrix A is a symmetric matrix instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Syr(const Layout layout, const Triangle triangle,
+               const size_t n,
+               const T alpha,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const float alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const double alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsyr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const cl_half alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SYR:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for SYR:
+
+* The value of `a_ld` must be at least `n`.
+
+
+
+xSPR: Symmetric packed rank-1 matrix update
+-------------
+
+Same operation as xSYR, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Spr(const Layout layout, const Triangle triangle,
+               const size_t n,
+               const T alpha,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               cl_mem ap_buffer, const size_t ap_offset,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const float alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem ap_buffer, const size_t ap_offset,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const double alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem ap_buffer, const size_t ap_offset,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHspr(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                              const size_t n,
+                              const cl_half alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              cl_mem ap_buffer, const size_t ap_offset,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SPR:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
+* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xSYR2: Symmetric rank-2 matrix update
+-------------
+
+Same operation as xHER2, but matrix _A_ is a symmetric matrix instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Syr2(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const float alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const double alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsyr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_half alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SYR2:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_mem a_buffer`: OpenCL buffer to store the output A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the output A matrix.
+* `const size_t a_ld`: Leading dimension of the output A matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for SYR2:
+
+* The value of `a_ld` must be at least `n`.
+
+
+
+xSPR2: Symmetric packed rank-2 matrix update
+-------------
+
+Same operation as xSYR2, but matrix _A_ is a symmetric packed matrix instead and represented as _AP_.
+
+C++ API:
+```
+template <typename T>
+StatusCode Spr2(const Layout layout, const Triangle triangle,
+                const size_t n,
+                const T alpha,
+                const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                cl_mem ap_buffer, const size_t ap_offset,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const float alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem ap_buffer, const size_t ap_offset,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const double alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem ap_buffer, const size_t ap_offset,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHspr2(const CLBlastLayout layout, const CLBlastTriangle triangle,
+                               const size_t n,
+                               const cl_half alpha,
+                               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                               cl_mem ap_buffer, const size_t ap_offset,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SPR2:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `cl_mem ap_buffer`: OpenCL buffer to store the output AP matrix.
+* `const size_t ap_offset`: The offset in elements from the start of the output AP matrix.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xGEMM: General matrix-matrix multiplication
+-------------
+
+Performs the matrix product _C = alpha * A * B + beta * C_, in which _A_ (_m_ by _k_) and _B_ (_k_ by _n_) are two general rectangular input matrices, _C_ (_m_ by _n_) is the matrix to be updated, and _alpha_ and _beta_ are scalar values. The matrices _A_ and/or _B_ can optionally be transposed before performing the operation.
+
+C++ API:
+```
+template <typename T>
+StatusCode Gemm(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                const size_t m, const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event,
+                cl_mem temp_buffer = nullptr)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const float beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const double beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const cl_float2 beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const cl_double2 beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHgemm(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const cl_half beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to GEMM:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for GEMM:
+
+* When `a_transpose == Transpose::kNo`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`.
+* When `b_transpose == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`.
+* The value of `c_ld` must be at least `m`.
+
+
+
+xSYMM: Symmetric matrix-matrix multiplication
+-------------
+
+Same operation as xGEMM, but _A_ is symmetric instead. In case of `side == kLeft`, _A_ is a symmetric _m_ by _m_ matrix and _C = alpha * A * B + beta * C_ is performed. Otherwise, in case of `side == kRight`, _A_ is a symmetric _n_ by _n_ matrix and _C = alpha * B * A + beta * C_ is performed.
+
+C++ API:
+```
+template <typename T>
+StatusCode Symm(const Layout layout, const Side side, const Triangle triangle,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                               const size_t m, const size_t n,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const float beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                               const size_t m, const size_t n,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const double beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                               const size_t m, const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const cl_float2 beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                               const size_t m, const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const cl_double2 beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsymm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                               const size_t m, const size_t n,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const cl_half beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SYMM:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Side side`: The position of the symmetric matrix A in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for SYMM:
+
+* When `side == Side::kLeft`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
+* The value of `b_ld` must be at least `m`.
+* The value of `c_ld` must be at least `m`.
+
+
+
+xHEMM: Hermitian matrix-matrix multiplication
+-------------
+
+Same operation as xSYMM, but _A_ is an Hermitian matrix instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Hemm(const Layout layout, const Side side, const Triangle triangle,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastChemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                               const size_t m, const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const cl_float2 beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhemm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle,
+                               const size_t m, const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               const cl_double2 beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HEMM:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Side side`: The position of the Hermitian matrix A in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for HEMM:
+
+* When `side == Side::kLeft`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
+* The value of `b_ld` must be at least `m`.
+* The value of `c_ld` must be at least `m`.
+
+
+
+xSYRK: Rank-K update of a symmetric matrix
+-------------
+
+Performs the matrix product _C = alpha * A * A^T + beta * C_ or _C = alpha * A^T * A + beta * C_, in which _A_ is a general matrix and _A^T_ is its transpose, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.
+
+C++ API:
+```
+template <typename T>
+StatusCode Syrk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                               const size_t n, const size_t k,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const float beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                               const size_t n, const size_t k,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const double beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                               const size_t n, const size_t k,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_float2 beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                               const size_t n, const size_t k,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_double2 beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsyrk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                               const size_t n, const size_t k,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const cl_half beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SYRK:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for SYRK:
+
+* When `a_transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
+* The value of `c_ld` must be at least `n`.
+
+
+
+xHERK: Rank-K update of a Hermitian matrix
+-------------
+
+Same operation as xSYRK, but _C_ is an Hermitian matrix instead.
+
+C++ API:
+```
+template <typename T>
+StatusCode Herk(const Layout layout, const Triangle triangle, const Transpose a_transpose,
+                const size_t n, const size_t k,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                const T beta,
+                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastCherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                               const size_t n, const size_t k,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const float beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZherk(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose,
+                               const size_t n, const size_t k,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               const double beta,
+                               cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HERK:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for HERK:
+
+* When `a_transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
+* The value of `c_ld` must be at least `n`.
+
+
+
+xSYR2K: Rank-2K update of a symmetric matrix
+-------------
+
+Performs the matrix product _C = alpha * A * B^T + alpha * B * A^T + beta * C_ or _C = alpha * A^T * B + alpha * B^T * A + beta * C_, in which _A_ and _B_ are general matrices and _A^T_ and _B^T_ are their transposed versions, _C_ (_n_ by _n_) is the symmetric matrix to be updated, and _alpha_ and _beta_ are scalar values.
+
+C++ API:
+```
+template <typename T>
+StatusCode Syr2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                 const size_t n, const size_t k,
+                 const T alpha,
+                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                 const T beta,
+                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                 cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                                const size_t n, const size_t k,
+                                const float alpha,
+                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                const float beta,
+                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                                const size_t n, const size_t k,
+                                const double alpha,
+                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                const double beta,
+                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                                const size_t n, const size_t k,
+                                const cl_float2 alpha,
+                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                const cl_float2 beta,
+                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                                const size_t n, const size_t k,
+                                const cl_double2 alpha,
+                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                const cl_double2 beta,
+                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHsyr2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                                const size_t n, const size_t k,
+                                const cl_half alpha,
+                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                const cl_half beta,
+                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to SYR2K:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose ab_transpose`: Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for SYR2K:
+
+* When `ab_transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
+* When `ab_transpose == Transpose::kNo`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`.
+* The value of `c_ld` must be at least `n`.
+
+
+
+xHER2K: Rank-2K update of a Hermitian matrix
+-------------
+
+Same operation as xSYR2K, but _C_ is an Hermitian matrix instead.
+
+C++ API:
+```
+template <typename T, typename U>
+StatusCode Her2k(const Layout layout, const Triangle triangle, const Transpose ab_transpose,
+                 const size_t n, const size_t k,
+                 const T alpha,
+                 const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                 const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                 const U beta,
+                 cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                 cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastCher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                                const size_t n, const size_t k,
+                                const cl_float2 alpha,
+                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                const float beta,
+                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZher2k(const CLBlastLayout layout, const CLBlastTriangle triangle, const CLBlastTranspose ab_transpose,
+                                const size_t n, const size_t k,
+                                const cl_double2 alpha,
+                                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                const cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                const double beta,
+                                cl_mem c_buffer, const size_t c_offset, const size_t c_ld,
+                                cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HER2K:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose ab_transpose`: Transposing the input matrices A and B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
+* `const U beta`: Input scalar constant.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for HER2K:
+
+* When `ab_transpose == Transpose::kNo`, then `a_ld` must be at least `n`, otherwise `a_ld` must be at least `k`.
+* When `ab_transpose == Transpose::kNo`, then `b_ld` must be at least `n`, otherwise `b_ld` must be at least `k`.
+* The value of `c_ld` must be at least `n`.
+
+
+
+xTRMM: Triangular matrix-matrix multiplication
+-------------
+
+Performs the matrix product _B = alpha * A * B_ or _B = alpha * B * A_, in which _A_ is a unit or non-unit triangular matrix, _B_ (_m_ by _n_) is the general matrix to be updated, and _alpha_ is a scalar value.
+
+C++ API:
+```
+template <typename T>
+StatusCode Trmm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastStrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHtrmm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const cl_half alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to TRMM:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal of the triangular matrix A, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
+* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for TRMM:
+
+* When `side == Side::kLeft`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `n`.
+* The value of `b_ld` must be at least `m`.
+
+
+
+xTRSM: Solves a triangular system of equations
+-------------
+
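+Solves the equation _A * X = alpha * B_ (or _X * A = alpha * B_ when `side` is `Side::kRight`) for the unknown _m_ by _n_ matrix _X_, in which _A_ is a unit or non-unit triangular matrix (_m_ by _m_ when on the left side, _n_ by _n_ when on the right side) and _B_ is an _m_ by _n_ matrix. The matrix _B_ is overwritten by the solution _X_.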
+
+C++ API:
+```
+template <typename T>
+StatusCode Trsm(const Layout layout, const Side side, const Triangle triangle, const Transpose a_transpose, const Diagonal diagonal,
+                const size_t m, const size_t n,
+                const T alpha,
+                const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastStrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const float alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const double alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const cl_float2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZtrsm(const CLBlastLayout layout, const CLBlastSide side, const CLBlastTriangle triangle, const CLBlastTranspose a_transpose, const CLBlastDiagonal diagonal,
+                               const size_t m, const size_t n,
+                               const cl_double2 alpha,
+                               const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                               cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                               cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to TRSM:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Side side`: The position of the triangular matrix in the operation, either on the `Side::kLeft` (141) or `Side::kRight` (142).
+* `const Triangle triangle`: The part of the array of the triangular matrix to be used, either `Triangle::kUpper` (121) or `Triangle::kLower` (122).
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Diagonal diagonal`: The property of the diagonal of the triangular matrix A, either `Diagonal::kNonUnit` (131) for non-unit values on the diagonal or `Diagonal::kUnit` (132) for unit values on the diagonal.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
+* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xHAD: Element-wise vector product (Hadamard)
+-------------
+
+Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.
+
+C++ API:
+```
+template <typename T>
+StatusCode Had(const size_t n,
+               const T alpha,
+               const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+               const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+               const T beta,
+               cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+               cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastShad(const size_t n,
+                              const float alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const float beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDhad(const size_t n,
+                              const double alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const double beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastChad(const size_t n,
+                              const cl_float2 alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const cl_float2 beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZhad(const size_t n,
+                              const cl_double2 alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const cl_double2 beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHhad(const size_t n,
+                              const cl_half alpha,
+                              const cl_mem x_buffer, const size_t x_offset, const size_t x_inc,
+                              const cl_mem y_buffer, const size_t y_offset, const size_t y_inc,
+                              const cl_half beta,
+                              cl_mem z_buffer, const size_t z_offset, const size_t z_inc,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to HAD:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t x_offset`: The offset in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `const cl_mem y_buffer`: OpenCL buffer to store the input y vector.
+* `const size_t y_offset`: The offset in elements from the start of the input y vector.
+* `const size_t y_inc`: Stride/increment of the input y vector. This value must be greater than 0.
+* `const T beta`: Input scalar constant.
+* `cl_mem z_buffer`: OpenCL buffer to store the output z vector.
+* `const size_t z_offset`: The offset in elements from the start of the output z vector.
+* `const size_t z_inc`: Stride/increment of the output z vector. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xOMATCOPY: Scaling and out-place transpose/copy (non-BLAS function)
+-------------
+
+Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.
+
+C++ API:
+```
+template <typename T>
+StatusCode Omatcopy(const Layout layout, const Transpose a_transpose,
+                    const size_t m, const size_t n,
+                    const T alpha,
+                    const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                    cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                    cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                   const size_t m, const size_t n,
+                                   const float alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                   const size_t m, const size_t n,
+                                   const double alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastComatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                   const size_t m, const size_t n,
+                                   const cl_float2 alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                   const size_t m, const size_t n,
+                                   const cl_double2 alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHomatcopy(const CLBlastLayout layout, const CLBlastTranspose a_transpose,
+                                   const size_t m, const size_t n,
+                                   const cl_half alpha,
+                                   const cl_mem a_buffer, const size_t a_offset, const size_t a_ld,
+                                   cl_mem b_buffer, const size_t b_offset, const size_t b_ld,
+                                   cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to OMATCOPY:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `cl_mem b_buffer`: OpenCL buffer to store the output B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the output B matrix.
+* `const size_t b_ld`: Leading dimension of the output B matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for OMATCOPY:
+
+* The value of `a_ld` must be at least `m`.
+* The value of `b_ld` must be at least `n`.
+
+
+
+xIM2COL: Im2col function (non-BLAS function)
+-------------
+
+Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix. Overwrites any existing values in the _col_ buffer.
+
+C++ API:
+```
+template <typename T>
+StatusCode Im2col(const KernelMode kernel_mode,
+                  const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                  const cl_mem im_buffer, const size_t im_offset,
+                  cl_mem col_buffer, const size_t col_offset,
+                  cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSim2col(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem im_buffer, const size_t im_offset,
+                                 cl_mem col_buffer, const size_t col_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDim2col(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem im_buffer, const size_t im_offset,
+                                 cl_mem col_buffer, const size_t col_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCim2col(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem im_buffer, const size_t im_offset,
+                                 cl_mem col_buffer, const size_t col_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZim2col(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem im_buffer, const size_t im_offset,
+                                 cl_mem col_buffer, const size_t col_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHim2col(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem im_buffer, const size_t im_offset,
+                                 cl_mem col_buffer, const size_t col_offset,
+                                 cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to IM2COL:
+
+* `const KernelMode kernel_mode`: The kernel mode, either `KernelMode::kCrossCorrelation` for the normal mode, or `KernelMode::kConvolution` for the convolution mode that flips a kernel along `h` and `w` axes.
+* `const size_t channels`: Integer size argument. This value must be positive.
+* `const size_t height`: Integer size argument. This value must be positive.
+* `const size_t width`: Integer size argument. This value must be positive.
+* `const size_t kernel_h`: Integer size argument. This value must be positive.
+* `const size_t kernel_w`: Integer size argument. This value must be positive.
+* `const size_t pad_h`: Integer size argument. This value may be zero (no padding).
+* `const size_t pad_w`: Integer size argument. This value may be zero (no padding).
+* `const size_t stride_h`: Integer size argument. This value must be positive.
+* `const size_t stride_w`: Integer size argument. This value must be positive.
+* `const size_t dilation_h`: Integer size argument. This value must be positive.
+* `const size_t dilation_w`: Integer size argument. This value must be positive.
+* `const cl_mem im_buffer`: OpenCL buffer to store the input im tensor.
+* `const size_t im_offset`: The offset in elements from the start of the input im tensor.
+* `cl_mem col_buffer`: OpenCL buffer to store the output col tensor.
+* `const size_t col_offset`: The offset in elements from the start of the output col tensor.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xCOL2IM: Col2im function (non-BLAS function)
+-------------
+
+Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix. Accumulates results on top of the existing values in the _im_ buffer.
+
+C++ API:
+```
+template <typename T>
+StatusCode Col2im(const KernelMode kernel_mode,
+                  const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                  const cl_mem col_buffer, const size_t col_offset,
+                  cl_mem im_buffer, const size_t im_offset,
+                  cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastScol2im(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDcol2im(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCcol2im(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZcol2im(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHcol2im(const CLBlastKernelMode kernel_mode,
+                                 const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+                                 const cl_mem col_buffer, const size_t col_offset,
+                                 cl_mem im_buffer, const size_t im_offset,
+                                 cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to COL2IM:
+
+* `const KernelMode kernel_mode`: The kernel mode, either `KernelMode::kCrossCorrelation` for the normal mode, or `KernelMode::kConvolution` for the convolution mode that flips the kernel along the `h` and `w` axes.
+* `const size_t channels`: Integer size argument. This value must be positive.
+* `const size_t height`: Integer size argument. This value must be positive.
+* `const size_t width`: Integer size argument. This value must be positive.
+* `const size_t kernel_h`: Integer size argument. This value must be positive.
+* `const size_t kernel_w`: Integer size argument. This value must be positive.
+* `const size_t pad_h`: Integer size argument. This value may be zero (no padding).
+* `const size_t pad_w`: Integer size argument. This value may be zero (no padding).
+* `const size_t stride_h`: Integer size argument. This value must be positive.
+* `const size_t stride_w`: Integer size argument. This value must be positive.
+* `const size_t dilation_h`: Integer size argument. This value must be positive.
+* `const size_t dilation_w`: Integer size argument. This value must be positive.
+* `const cl_mem col_buffer`: OpenCL buffer to store the input col tensor.
+* `const size_t col_offset`: The offset in elements from the start of the input col tensor.
+* `cl_mem im_buffer`: OpenCL buffer to store the output im tensor.
+* `const size_t im_offset`: The offset in elements from the start of the output im tensor.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xCONVGEMM: Batched convolution as GEMM (non-BLAS function)
+-------------
+
+Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).
+
+C++ API:
+```
+template <typename T>
+StatusCode Convgemm(const KernelMode kernel_mode,
+                    const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                    const cl_mem im_buffer, const size_t im_offset,
+                    const cl_mem kernel_buffer, const size_t kernel_offset,
+                    cl_mem result_buffer, const size_t result_offset,
+                    cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSconvgemm(const CLBlastKernelMode kernel_mode,
+                                   const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDconvgemm(const CLBlastKernelMode kernel_mode,
+                                   const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHconvgemm(const CLBlastKernelMode kernel_mode,
+                                   const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,
+                                   const cl_mem im_buffer, const size_t im_offset,
+                                   const cl_mem kernel_buffer, const size_t kernel_offset,
+                                   cl_mem result_buffer, const size_t result_offset,
+                                   cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to CONVGEMM:
+
+* `const KernelMode kernel_mode`: The kernel mode, either `KernelMode::kCrossCorrelation` for the normal mode, or `KernelMode::kConvolution` for the convolution mode that flips the kernel along the `h` and `w` axes.
+* `const size_t channels`: Integer size argument. This value must be positive.
+* `const size_t height`: Integer size argument. This value must be positive.
+* `const size_t width`: Integer size argument. This value must be positive.
+* `const size_t kernel_h`: Integer size argument. This value must be positive.
+* `const size_t kernel_w`: Integer size argument. This value must be positive.
+* `const size_t pad_h`: Integer size argument. This value may be zero (no padding).
+* `const size_t pad_w`: Integer size argument. This value may be zero (no padding).
+* `const size_t stride_h`: Integer size argument. This value must be positive.
+* `const size_t stride_w`: Integer size argument. This value must be positive.
+* `const size_t dilation_h`: Integer size argument. This value must be positive.
+* `const size_t dilation_w`: Integer size argument. This value must be positive.
+* `const size_t num_kernels`: Integer size argument. This value must be positive.
+* `const size_t batch_count`: Integer size argument. This value must be positive.
+* `const cl_mem im_buffer`: OpenCL buffer to store the input im tensor.
+* `const size_t im_offset`: The offset in elements from the start of the input im tensor.
+* `const cl_mem kernel_buffer`: OpenCL buffer to store the input kernel tensor.
+* `const size_t kernel_offset`: The offset in elements from the start of the input kernel tensor.
+* `cl_mem result_buffer`: OpenCL buffer to store the output result tensor.
+* `const size_t result_offset`: The offset in elements from the start of the output result tensor.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xAXPYBATCHED: Batched version of AXPY
+-------------
+
+As AXPY, but multiple operations are batched together for better performance.
+
+C++ API:
+```
+template <typename T>
+StatusCode AxpyBatched(const size_t n,
+                       const T *alphas,
+                       const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                       cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                       const size_t batch_count,
+                       cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSaxpyBatched(const size_t n,
+                                      const float *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDaxpyBatched(const size_t n,
+                                      const double *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCaxpyBatched(const size_t n,
+                                      const cl_float2 *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZaxpyBatched(const size_t n,
+                                      const cl_double2 *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHaxpyBatched(const size_t n,
+                                      const cl_half *alphas,
+                                      const cl_mem x_buffer, const size_t *x_offsets, const size_t x_inc,
+                                      cl_mem y_buffer, const size_t *y_offsets, const size_t y_inc,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to AXPYBATCHED:
+
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const T *alphas`: Input scalar constants.
+* `const cl_mem x_buffer`: OpenCL buffer to store the input x vector.
+* `const size_t *x_offsets`: The offsets in elements from the start of the input x vector.
+* `const size_t x_inc`: Stride/increment of the input x vector. This value must be greater than 0.
+* `cl_mem y_buffer`: OpenCL buffer to store the output y vector.
+* `const size_t *y_offsets`: The offsets in elements from the start of the output y vector.
+* `const size_t y_inc`: Stride/increment of the output y vector. This value must be greater than 0.
+* `const size_t batch_count`: Number of batches. This value must be positive.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+
+
+xGEMMBATCHED: Batched version of GEMM
+-------------
+
+As GEMM, but multiple operations are batched together for better performance.
+
+C++ API:
+```
+template <typename T>
+StatusCode GemmBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                       const size_t m, const size_t n, const size_t k,
+                       const T *alphas,
+                       const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
+                       const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
+                       const T *betas,
+                       cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
+                       const size_t batch_count,
+                       cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                      const size_t m, const size_t n, const size_t k,
+                                      const float *alphas,
+                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
+                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
+                                      const float *betas,
+                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                      const size_t m, const size_t n, const size_t k,
+                                      const double *alphas,
+                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
+                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
+                                      const double *betas,
+                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                      const size_t m, const size_t n, const size_t k,
+                                      const cl_float2 *alphas,
+                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
+                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
+                                      const cl_float2 *betas,
+                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                      const size_t m, const size_t n, const size_t k,
+                                      const cl_double2 *alphas,
+                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
+                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
+                                      const cl_double2 *betas,
+                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHgemmBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                      const size_t m, const size_t n, const size_t k,
+                                      const cl_half *alphas,
+                                      const cl_mem a_buffer, const size_t *a_offsets, const size_t a_ld,
+                                      const cl_mem b_buffer, const size_t *b_offsets, const size_t b_ld,
+                                      const cl_half *betas,
+                                      cl_mem c_buffer, const size_t *c_offsets, const size_t c_ld,
+                                      const size_t batch_count,
+                                      cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to GEMMBATCHED:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T *alphas`: Input scalar constants.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t *a_offsets`: The offsets in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
+* `const size_t *b_offsets`: The offsets in elements from the start of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
+* `const T *betas`: Input scalar constants.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t *c_offsets`: The offsets in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `const size_t batch_count`: Number of batches. This value must be positive.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for GEMMBATCHED:
+
+* When `a_transpose == Transpose::kNo`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`.
+* When `b_transpose == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`.
+* The value of `c_ld` must be at least `m`.
+
+
+
+xGEMMSTRIDEDBATCHED: StridedBatched version of GEMM
+-------------
+
+As GEMM, but multiple strided operations are batched together for better performance.
+
+C++ API:
+```
+template <typename T>
+StatusCode GemmStridedBatched(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                              const size_t m, const size_t n, const size_t k,
+                              const T alpha,
+                              const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
+                              const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
+                              const T beta,
+                              cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
+                              const size_t batch_count,
+                              cl_command_queue* queue, cl_event* event)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                             const size_t m, const size_t n, const size_t k,
+                                             const float alpha,
+                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
+                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
+                                             const float beta,
+                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
+                                             const size_t batch_count,
+                                             cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastDgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                             const size_t m, const size_t n, const size_t k,
+                                             const double alpha,
+                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
+                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
+                                             const double beta,
+                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
+                                             const size_t batch_count,
+                                             cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastCgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                             const size_t m, const size_t n, const size_t k,
+                                             const cl_float2 alpha,
+                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
+                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
+                                             const cl_float2 beta,
+                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
+                                             const size_t batch_count,
+                                             cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastZgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                             const size_t m, const size_t n, const size_t k,
+                                             const cl_double2 alpha,
+                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
+                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
+                                             const cl_double2 beta,
+                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
+                                             const size_t batch_count,
+                                             cl_command_queue* queue, cl_event* event)
+CLBlastStatusCode CLBlastHgemmStridedBatched(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                                             const size_t m, const size_t n, const size_t k,
+                                             const cl_half alpha,
+                                             const cl_mem a_buffer, const size_t a_offset, const size_t a_ld, const size_t a_stride,
+                                             const cl_mem b_buffer, const size_t b_offset, const size_t b_ld, const size_t b_stride,
+                                             const cl_half beta,
+                                             cl_mem c_buffer, const size_t c_offset, const size_t c_ld, const size_t c_stride,
+                                             const size_t batch_count,
+                                             cl_command_queue* queue, cl_event* event)
+```
+
+Arguments to GEMMSTRIDEDBATCHED:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const T alpha`: Input scalar constant.
+* `const cl_mem a_buffer`: OpenCL buffer to store the input A matrix.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const size_t a_stride`: The (fixed) stride between two batches of the A matrix.
+* `const cl_mem b_buffer`: OpenCL buffer to store the input B matrix.
+* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
+* `const size_t b_stride`: The (fixed) stride between two batches of the B matrix.
+* `const T beta`: Input scalar constant.
+* `cl_mem c_buffer`: OpenCL buffer to store the output C matrix.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `const size_t c_stride`: The (fixed) stride between two batches of the C matrix.
+* `const size_t batch_count`: Number of batches. This value must be positive.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `cl_event* event`: Pointer to an OpenCL event to be able to wait for completion of the routine's OpenCL kernel(s). This is an optional argument.
+
+Requirements for GEMMSTRIDEDBATCHED:
+
+* When `a_transpose == Transpose::kNo`, then `a_ld` must be at least `m`, otherwise `a_ld` must be at least `k`.
+* When `b_transpose == Transpose::kNo`, then `b_ld` must be at least `k`, otherwise `b_ld` must be at least `n`.
+* The value of `c_ld` must be at least `m`.
+
+
+
+GemmTempBufferSize: Retrieves the size of the temporary buffer for GEMM (auxiliary function)
+-------------
+
+Retrieves the required size of the temporary buffer for the GEMM kernel for specific arguments and for a specific device/platform and tuning parameters. This could be 0 in case no temporary buffer is required. Arguments are similar to those for GEMM.
+
+C++ API:
+```
+template <typename T>
+StatusCode GemmTempBufferSize(const Layout layout, const Transpose a_transpose, const Transpose b_transpose,
+                              const size_t m, const size_t n, const size_t k,
+                              const size_t a_offset, const size_t a_ld,
+                              const size_t b_offset, const size_t b_ld,
+                              const size_t c_offset, const size_t c_ld,
+                              cl_command_queue* queue, size_t& temp_buffer_size)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastSGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const size_t a_offset, const size_t a_ld,
+                               const size_t b_offset, const size_t b_ld,
+                               const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, size_t* temp_buffer_size)
+
+CLBlastStatusCode CLBlastDGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const size_t a_offset, const size_t a_ld,
+                               const size_t b_offset, const size_t b_ld,
+                               const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, size_t* temp_buffer_size)
+
+CLBlastStatusCode CLBlastCGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const size_t a_offset, const size_t a_ld,
+                               const size_t b_offset, const size_t b_ld,
+                               const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, size_t* temp_buffer_size)
+
+CLBlastStatusCode CLBlastZGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const size_t a_offset, const size_t a_ld,
+                               const size_t b_offset, const size_t b_ld,
+                               const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, size_t* temp_buffer_size)
+
+CLBlastStatusCode CLBlastHGemmTempBufferSize(const CLBlastLayout layout, const CLBlastTranspose a_transpose, const CLBlastTranspose b_transpose,
+                               const size_t m, const size_t n, const size_t k,
+                               const size_t a_offset, const size_t a_ld,
+                               const size_t b_offset, const size_t b_ld,
+                               const size_t c_offset, const size_t c_ld,
+                               cl_command_queue* queue, size_t* temp_buffer_size)
+```
+
+Arguments to GemmTempBufferSize:
+
+* `const Layout layout`: Data-layout of the matrices, either `Layout::kRowMajor` (101) for row-major layout or `Layout::kColMajor` (102) for column-major data-layout.
+* `const Transpose a_transpose`: Transposing the input matrix A, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const Transpose b_transpose`: Transposing the input matrix B, either `Transpose::kNo` (111), `Transpose::kYes` (112), or `Transpose::kConjugate` (113) for a complex-conjugate transpose.
+* `const size_t m`: Integer size argument. This value must be positive.
+* `const size_t n`: Integer size argument. This value must be positive.
+* `const size_t k`: Integer size argument. This value must be positive.
+* `const size_t a_offset`: The offset in elements from the start of the input A matrix.
+* `const size_t a_ld`: Leading dimension of the input A matrix. This value must be greater than 0.
+* `const size_t b_offset`: The offset in elements from the start of the input B matrix.
+* `const size_t b_ld`: Leading dimension of the input B matrix. This value must be greater than 0.
+* `const size_t c_offset`: The offset in elements from the start of the output C matrix.
+* `const size_t c_ld`: Leading dimension of the output C matrix. This value must be greater than 0.
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to execute the routine on.
+* `size_t& temp_buffer_size`: The result of this function: the required buffer size. In the C API this argument is passed as a pointer (`size_t* temp_buffer_size`).
+
+
+
+ClearCache: Resets the cache of compiled binaries (auxiliary function)
+-------------
+
+CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache can be cleared to free up system memory or it can be useful in case of debugging.
+
+C++ API:
+```
+StatusCode ClearCache()
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastClearCache()
+```
+
+
+
+FillCache: Populates the cache of compiled binaries for a specific device (auxiliary function)
+-------------
+
+CLBlast stores binaries of compiled kernels into a cache in case the same kernel is used later on for the same device. This cache is automatically populated whenever a new binary is created. Thus, the first run of a specific kernel could take extra time. For debugging or performance evaluation purposes, it might be useful to populate the cache upfront. This function populates the cache for all kernels in CLBlast for all precisions, but for a specific device only.
+
+C++ API:
+```
+StatusCode FillCache(const cl_device_id device)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastFillCache(const cl_device_id device)
+```
+
+Arguments to FillCache:
+
+* `const cl_device_id device`: The OpenCL device to fill the cache for.
+
+
+
+RetrieveParameters: Retrieves current tuning parameters (auxiliary function)
+-------------
+
+This function retrieves current tuning parameters for a specific device-precision-kernel combination. This can be used for debugging or inspection. See [tuning.md](tuning.md) for more details on which kernel names and parameters are valid.
+
+C++ API:
+```
+StatusCode RetrieveParameters(const cl_device_id device, const std::string &kernel_name,
+                              const Precision precision,
+                              std::unordered_map<std::string,size_t> &parameters)
+```
+
+A C API is not available for this function.
+
+Arguments to RetrieveParameters (C++ version):
+
+* `const cl_device_id device`: The OpenCL device to query the parameters for.
+* `const std::string &kernel_name`: The target kernel name. This has to be one of the existing CLBlast kernels (Xaxpy, Xdot, Xgemv, XgemvFast, XgemvFastRot, Xger, Copy, Pad, Transpose, Padtranspose, Xgemm, or XgemmDirect). If this argument is incorrect, this function will return with the `clblast::kInvalidOverrideKernel` status-code.
+* `const Precision precision`: The CLBlast precision enum to query the parameters for.
+* `std::unordered_map<std::string,size_t> &parameters`: An unordered map of strings to integers. This will be filled with the current tuning parameters for a specific kernel.
+
+
+
+OverrideParameters: Override tuning parameters (auxiliary function)
+-------------
+
+This function overrides tuning parameters for a specific device-precision-kernel combination. The next time the target routine is called it will be re-compiled and use the new parameters. All further times (until `OverrideParameters` is called again) it will load the kernel from the cache and thus continue to use the new parameters. Note that the first time after calling `OverrideParameters` a performance drop can be observable due to the re-compilation of the kernel. See [tuning.md](tuning.md) for more details on which kernel names and parameters are valid.
+
+C++ API:
+```
+StatusCode OverrideParameters(const cl_device_id device, const std::string &kernel_name,
+                              const Precision precision,
+                              const std::unordered_map<std::string,size_t> &parameters)
+```
+
+C API:
+```
+CLBlastStatusCode CLBlastOverrideParameters(const cl_device_id device, const char* kernel_name,
+                                            const CLBlastPrecision precision, const size_t num_parameters,
+                                            const char** parameters_names, const size_t* parameters_values)
+```
+
+Arguments to OverrideParameters (C++ version):
+
+* `const cl_device_id device`: The OpenCL device to set the new parameters for.
+* `const std::string &kernel_name`: The target kernel name. This has to be one of the existing CLBlast kernels (Xaxpy, Xdot, Xgemv, XgemvFast, XgemvFastRot, Xger, Copy, Pad, Transpose, Padtranspose, Xgemm, or XgemmDirect). If this argument is incorrect, this function will return with the `clblast::kInvalidOverrideKernel` status-code.
+* `const Precision precision`: The CLBlast precision enum to set the new parameters for.
+* `const std::unordered_map<std::string,size_t> &parameters`: An unordered map of strings to integers. This has to contain all the tuning parameters for a specific kernel as reported by the included tuners (e.g. `{ {"COPY_DIMX",8}, {"COPY_DIMY",32}, {"COPY_VW",4}, {"COPY_WPT",8} }` for the `Copy` kernel). If this argument is incorrect, this function will return with the `clblast::kMissingOverrideParameter` status-code.
+
+
+
+Tune<kernel_name>: Run the tuner for a particular kernel (advanced usage)
+-------------
+
+The CLBlast kernels can be tuned using the tuning binaries, but also programmatically through an API. This is only recommended for advanced usage; see [the tuning docs](tuning.md) for more information.
+
+C++ API:
+```
+// Tunes the "Xaxpy" kernel, used for many level-1 routines such as XAXPY, XCOPY, and XSWAP
+template <typename T>
+StatusCode PUBLIC_API TuneXaxpy(cl_command_queue* queue, const size_t n,
+                                const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Xdot" kernel, used for level-1 reduction routines such as XDOT, XMAX, and XSUM
+template <typename T>
+StatusCode PUBLIC_API TuneXdot(cl_command_queue* queue, const size_t n,
+                               const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Xgemv" kernel, used for matrix-vector level-2 routines such as XGEMV, XGBMV, and XHEMV
+template <typename T>
+StatusCode PUBLIC_API TuneXgemv(cl_command_queue* queue, const size_t m, const size_t n,
+                                const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Xger" kernel, used for matrix update level-2 routines such as XGER, XHER, and XSYR2
+template <typename T>
+StatusCode PUBLIC_API TuneXger(cl_command_queue* queue, const size_t m, const size_t n,
+                               const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Xgemm" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
+template <typename T>
+StatusCode PUBLIC_API TuneXgemm(cl_command_queue* queue, const size_t m, const size_t n, const size_t k,
+                               const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "XgemmDirect" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
+template <typename T>
+StatusCode PUBLIC_API TuneXgemmDirect(cl_command_queue* queue, const size_t m, const size_t n, const size_t k,
+                                      const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Copy" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
+template <typename T>
+StatusCode PUBLIC_API TuneCopy(cl_command_queue* queue, const size_t m, const size_t n,
+                               const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Pad" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
+template <typename T>
+StatusCode PUBLIC_API TunePad(cl_command_queue* queue, const size_t m, const size_t n,
+                              const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Transpose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
+template <typename T>
+StatusCode PUBLIC_API TuneTranspose(cl_command_queue* queue, const size_t m, const size_t n,
+                                    const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Padtranspose" kernel, used for most level-3 routines such as XGEMM, XSYMM, and XHER2K
+template <typename T>
+StatusCode PUBLIC_API TunePadtranspose(cl_command_queue* queue, const size_t m, const size_t n,
+                                       const double fraction, std::unordered_map<std::string,size_t> &parameters);
+
+// Tunes the "Invert" kernel, used for the level-3 routine XTRSM
+template <typename T>
+StatusCode PUBLIC_API TuneInvert(cl_command_queue* queue, const size_t m, const size_t n, const size_t k,
+                                 const double fraction, std::unordered_map<std::string,size_t> &parameters);
+```
+
+Arguments to Tune<kernel_name> (C++ version):
+
+* `cl_command_queue* queue`: Pointer to an OpenCL command queue associated with a context and device to tune the kernel for.
+* `const size_t m`: The routine argument `m` to tune for (not applicable for all kernels)
+* `const size_t n`: The routine argument `n` to tune for
+* `const size_t k`: The routine argument `k` to tune for (not applicable for all kernels)
+* `const double fraction`: A value between 0.0 and 1.0 which determines the fraction of the tuning search space to explore.
+* `std::unordered_map<std::string,size_t> &parameters`: An unordered map of strings to integers. This will return the best found tuning parameters.
diff --git a/doc/benchmarking.md b/doc/benchmarking.md
new file mode 100644
index 00000000..2a14d81e
--- /dev/null
+++ b/doc/benchmarking.md
@@ -0,0 +1,26 @@
+CLBlast: Performance measuring and benchmarking
+================
+
+This document describes how to measure the performance of CLBlast and how to compare it against other libraries. For other information about CLBlast, see the [main README](../README.md).
+
+
+Compiling the performance tests ('clients')
+-------------
+
+To test the performance of CLBlast and to compare optionally against [clBLAS](http://github.com/clMathLibraries/clBLAS), cuBLAS (if testing on an NVIDIA GPU and `-DCUBLAS=ON` is set), or a CPU BLAS library (if installed), compile with the clients enabled by specifying `-DCLIENTS=ON`, for example as follows:
+
+    cmake -DCLIENTS=ON ..
+
+The performance tests come in the form of client executables named `clblast_client_xxxxx`, in which `xxxxx` is the name of a routine (e.g. `xgemm`). These clients take a bunch of configuration options and directly run CLBlast in a head-to-head performance test against optionally clBLAS and/or a CPU BLAS library. You can use the command-line options `-clblas 1`, `-cblas 1`, or `-cublas 1` to select a library to test against.
+
+
+Benchmarking
+-------------
+
+On [the CLBlast website](https://cnugteren.github.io/clblast) you will find performance results for various devices. Performance is compared in this case against a tuned version of the clBLAS library and optionally also against cuBLAS. Such graphs can be generated automatically on your own device as well. First, compile CLBlast with the clients enabled (see above). Then, make sure your installation of the reference clBLAS is performance-tuned by running the `tune` executable (shipped with clBLAS). Finally, run the Python/Matplotlib graph-script found in `scripts/benchmark/benchmark.py`. For example, to generate the SGEMM PDF on device 1 of platform 0 from the `build` subdirectory:
+
+    python ../scripts/benchmark/benchmark.py --platform 0 --device 1 --benchmark gemm
+
+Note that the CLBlast library provides pre-tuned parameter-values for some devices only: if your device is not among these, then out-of-the-box performance might be poor. See the [tuning README](tuning.md) to find out how to tune for your device.
+
+In case performance is still sub-optimal or something else is wrong, CLBlast can be built in verbose mode for (performance) debugging by specifying `-DVERBOSE=ON` to CMake.
diff --git a/doc/bindings.md b/doc/bindings.md
new file mode 100644
index 00000000..85508e68
--- /dev/null
+++ b/doc/bindings.md
@@ -0,0 +1,38 @@
+CLBlast: Bindings / wrappers for other languages
+================
+
+The main APIs of CLBlast are C and C++ for OpenCL or CUDA. This document describes other APIs for other languages through bindings and wrappers. For other information about CLBlast, see the [main README](../README.md).
+
+
+Plain C: Netlib BLAS API
+-------------
+
+CLBlast provides a Netlib CBLAS C API. This is however not recommended for performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. Providing the `-DNETLIB=ON` flag to CMake at CLBlast compilation time will compile the Netlib API. Then, it can be used by including the corresponding header:
+
+    #include <clblast_netlib_c.h>
+
+The OpenCL device and platform can be set by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables.
+
+
+Python: PyCLBlast
+-------------
+
+PyCLBlast provides Python bindings for CLBlast. It is integrated in the main CLBlast project and can be installed through `pip`. Details can be found in the [PyCLBlast README](https://github.com/CNugteren/CLBlast/tree/master/src/pyclblast) or on [PyPi](https://pypi.python.org/pypi/pyclblast).
+
+
+Java: JOCLBlast (3rd party)
+-------------
+
+JOCLBlast is a 3rd party project providing bindings for Java. It is built on top of JOCL. Details can be found on the [JOCLBlast Github project page](https://github.com/gpu/JOCLBlast).
+
+
+Nim: nim-CLBlast (3rd party)
+-------------
+
+A 3rd party CLBlast wrapper for the nim language is available [here](https://github.com/numforge/nim-clblast).
+
+
+Julia: CLBlast.jl (3rd party)
+-------------
+
+A 3rd party CLBlast wrapper for [Julia](https://julialang.org/) is available [here](https://github.com/JuliaGPU/CLBlast.jl).
diff --git a/doc/details_conv.md b/doc/details_conv.md
new file mode 100644
index 00000000..65e18e70
--- /dev/null
+++ b/doc/details_conv.md
@@ -0,0 +1,22 @@
+CLBlast: Details on the CONVGEMM routine
+================
+
+This document gives a bit more detail on how the CONVGEMM routine is organised and implemented. For other information about CLBlast, see the [main README](../README.md).
+
+
+CONVGEMM: Two approaches
+-------------
+
+CLBlast implements two approaches to batched convolutions using GEMM: through im2col, or stand-alone:
+
+* `ConvGemmMethod::kWithIm2Col`: running first a batched version of im2col to prepare the data into a temporary buffer, and then running a batched version of GEMM. The implementation is just as the regular im2col and GEMM kernels in CLBlast, but it is implemented as a separate kernel so all the non-needed features can be stripped out and some optimizations can be made. It uses the tuning parameters of the regular im2col and GEMM kernels.
+
+* `ConvGemmMethod::kSingleKernel`: this is a single kernel approach: it loads the data in such a way that the im2col kernel is no longer needed, i.e. loading the data as the im2col transformation does it. That way it becomes a single kernel and there will be no need for an intermediate large buffer. It uses a separate set of tuning parameters, and can be tuned using the `clblast_tuner_xconvgemm` binary.
+
+
+CONVGEMM: Selecting which approach to use
+-------------
+
+Since CONVGEMM is a relatively new and experimental feature, selection of the approach is hard-coded in [xconvgemm.hpp on line 32](../src/routines/levelx/xconvgemm.hpp:32), but can be changed there in a single place.
+
+The main drawback of the `ConvGemmMethod::kWithIm2Col` approach is its extra memory usage, but depending on the device and setting, it might be faster compared to the `ConvGemmMethod::kSingleKernel` approach. The latter has the extra advantage that it has its own tuning parameters, so it can be fine-tuned for your specific use-case a bit better than the 2-kernel approach with im2col.
diff --git a/doc/details_gemm.md b/doc/details_gemm.md
new file mode 100644
index 00000000..d4666abb
--- /dev/null
+++ b/doc/details_gemm.md
@@ -0,0 +1,27 @@
+CLBlast: Details on the GEMM routine and kernel
+================
+
+This document gives a bit more detail on how the GEMM routine is organised and implemented. For other information about CLBlast, see the [main README](../README.md).
+
+
+GEMM: Two approaches
+-------------
+
+CLBlast implements two approaches to GEMM: direct and indirect:
+
+* Direct GEMM: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
+* Indirect GEMM: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.
+
+
+GEMM: In-direct approach
+-------------
+
+Similar to the work by Matsumoto et al. ("Performance Tuning of Matrix Multiplication in OpenCL on Different GPUs and CPUs"), the main GEMM kernel makes many assumptions on the input arguments, which are handled by pre-processing and post-processing kernels. These assumptions are e.g. matrix sizes are a multiple of the work-group sizes, offsets are zero, and matrix B is transposed. This is a good solution for larger problem sizes since O(n^2) data movement is typically cheaper than O(n^3) computation, but the hidden constant starts to play a role for smaller n. Therefore, there is also a single-kernel direct version available for those cases, but it shares most of the design and parameters as discussed below.
+
+The main kernel has 14 different parameters, of which some are illustrated in figure 1 in the [CLBlast paper](https://arxiv.org/pdf/1705.05249). The parameters define among others the work-group sizes in 2 dimensions (MWG, NWG), the 2D register tiling configuration (MWI, NWI), the vector widths of both input matrices (VWM, VWN), loop unroll factors (KWI), and whether or not and how to use the local memory.
+
+
+GEMM: Direct approach
+-------------
+
+This is a single-kernel approach that shares many of the parameters of the indirect kernel. One of the differences is that within the kernel there are checks for incomplete tiles in the m/n/k dimensions, influenced by the tuning parameters and the matrix sizes. These incomplete tiles will run a different part of the code, as they for example cannot benefit from vectorisation. Another difference is that there are dedicated kernels for each a/b transpose requirement: NN, NT, TN, TT for non-transposed and transposed.
+\ No newline at end of file
diff --git a/doc/faq.md b/doc/faq.md
new file mode 100644
index 00000000..2b5cbeaf
--- /dev/null
+++ b/doc/faq.md
@@ -0,0 +1,48 @@
+CLBlast: FAQ
+================
+
+This document answers some frequently asked questions. For other information about CLBlast, see the [main README](../README.md).
+
+
+What is the difference between the direct and indirect GEMM kernel?
+-------------
+
+There are two ways to perform GEMM implemented in CLBlast:
+
+* __Direct GEMM__: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
+* __Indirect GEMM__: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.
+
+The GEMM routine tuner will find out from which m/n/k sizes onwards the indirect approach is favorable over the direct approach. Typically the direct approach is faster for small matrices.
+
+
+What is the difference between the GEMMK=0 and GEMMK=1 kernels?
+-------------
+
+For the indirect GEMM kernel (see above) there are basically two implementations, an older approach (GEMMK=0) and a newer kernel with 2D register tiling and support for shuffling (GEMMK=1). On most devices the old approach is still the fastest, but some devices can benefit more from the other kernel. The regular GEMM kernel tuner will explore both kernels, making sure to select the fastest one.
+
+
+
+The GEMM tuner runs in 4 stages, what are they?
+-------------
+
+The regular GEMM tuner tunes the indirect kernel (see above), tuning for the GEMMK=0 kernel first (stage 1/4 and 2/4) followed by the GEMMK=1 variant (stage 3/4 and 4/4). In both cases, first a fixed set of likely-to-be-good parameters is explored fully (1/4 and 3/4), followed by a random selection of parameters in a much larger search space (2/4 and 4/4). In the end the library will only care about the final best kernel configuration among all 4 stages.
+
+The direct GEMM tuner runs in 2 stages: as above, it first explores a small set of parameters exhaustively, followed by a random selection of a larger search space.
+
+
+The GEMM routine uses too much memory or results in error -4, what can I do?
+-------------
+
+By design the indirect version of the GEMM kernel might allocate some temporary memory on your device, and that might be an issue in some scenarios. However, there are a few things you could do to avoid this:
+
+* Use the [override parameters](../include/clblast.h#L717) functionality to set the switching point between direct and in-direct kernels much further. Example [here in one of the tests](../test/routines/level3/xgemm.hpp#L73). This might affect the performance of the GEMM routine.
+
+* [Query the required buffer size](../include/clblast.h#L691), allocate the buffer yourself, and pass that to [GEMM](../include/clblast.h#L525). That way you are in control and can make sure it is only allocated once for example.
+
+* Make sure no temporary buffer is required. Thus, make sure the buffer size is already a multiple of the amount of work done per work-group, e.g. 32, 64 or 128 at most depending on the tuned values for your device (you can query them if wanted). Then also make sure they are pre-transposed as needed. The [query-temp-buffer-size function](../include/clblast.h#L691) and its implementation can help you figure out if you are there yet.
+
+
+The tuners occasionally report failures or errors, is this an issue?
+-------------
+
+The tuners explore many different kernel parameters, sometimes quite extreme, seeking the bounds of the hardware or resulting in very large binaries. Depending on your device and OpenCL implementation, it might well be that failures occur. However, the tuner will automatically detect incorrect results or failed kernels, and will skip them. Only if the number of failures is very large might something be wrong in the CLBlast code. In that case, it can be reported as an issue.
diff --git a/doc/glossary.md b/doc/glossary.md
new file mode 100644
index 00000000..821ffc69
--- /dev/null
+++ b/doc/glossary.md
@@ -0,0 +1,14 @@
+CLBlast: Glossary
+================
+
+This document describes some commonly used terms in CLBlast documentation and code. For other information about CLBlast, see the [main README](../README.md).
+
+* __BLAS__: The set of 'Basic Linear Algebra Subroutines'.
+* __Netlib BLAS__: The official BLAS API definition, with __CBLAS__ providing the C headers. 
+* __OpenCL__: The open compute language, a Khronos standard for heterogeneous and parallel computing, e.g. on GPUs.
+* __kernel__: An OpenCL parallel program that runs on the target device.
+* __clBLAS__: Another OpenCL BLAS library, maintained by AMD.
+* __cuBLAS__: The main CUDA BLAS library, maintained by NVIDIA.
+* __GEMM__: The 'GEneral Matrix Multiplication' routine.
+* __Direct GEMM__: Computing GEMM using a single generic kernel which handles all cases (e.g. all kinds of matrix sizes).
+* __Indirect GEMM__: Computing GEMM using multiple kernels: the main GEMM kernel and a few pre-processing and post-processing kernels. The main kernel makes several assumptions (e.g. sizes need to be multiples of 32), which the other kernels make sure are satisfied. The main kernel is often faster than the generic kernel of the direct approach, but the cost of pre-processing and post-processing kernels can sometimes be high for small sizes or particular devices.
diff --git a/doc/installation.md b/doc/installation.md
new file mode 100644
index 00000000..70c69321
--- /dev/null
+++ b/doc/installation.md
@@ -0,0 +1,121 @@
+CLBlast: Building and installing
+================
+
+This document describes how to compile, link, and install CLBlast on various platforms. You can either use a pre-built package or compile the library from source. For other information about CLBlast, see the [main README](../README.md).
+
+
+Requirements
+-------------
+
+The pre-requisites for compilation of CLBlast are kept as minimal as possible. A basic compilation infrastructure is all you need, no external dependencies are required. You'll need:
+
+* CMake version 2.8.10 or higher
+* A C++11 compiler, for example:
+  - GCC 4.7.0 or newer
+  - Clang 3.3 or newer
+  - AppleClang 5.0 or newer
+  - ICC 14.0 or newer
+  - MSVC (Visual Studio) 2013 or newer
+* An OpenCL 1.1 or newer library, for example:
+  - Apple OpenCL
+  - NVIDIA CUDA SDK
+  - AMD APP SDK
+  - Intel OpenCL
+  - Beignet
+  - Mesa Clover
+  - ARM Mali OpenCL
+  - Vivante OpenCL
+  - POCL
+
+
+Using pre-built packages
+-------------
+
+There are pre-built binaries available for Ubuntu, macOS, and Windows.
+
+For Ubuntu, CLBlast is available through [a PPA](https://launchpad.net/~cnugteren/+archive/ubuntu/clblast). The sources for the Debian packaging can be found [in a separate repository](https://github.com/CNugteren/CLBlast-packaging). CLBlast can be installed as follows on Ubuntu 16.04:
+
+    sudo add-apt-repository ppa:cnugteren/clblast
+    sudo apt-get update
+    sudo apt-get install libclblast-dev
+
+For Arch Linux and Manjaro, CLBlast is available as a [package](https://aur.archlinux.org/packages/clblast-git) maintained by a 3rd party.
+
+For OS X / macOS, CLBlast is available through [Homebrew](https://github.com/Homebrew/homebrew-core/blob/master/Formula/clblast.rb). It can be installed as follows:
+
+    brew update
+    brew install clblast
+
+For Windows, binaries are provided in a .zip file on Github as part of the [CLBlast release page](https://github.com/CNugteren/CLBlast/releases).
+
+
+Linux / macOS compilation from source
+-------------
+
+Configuration can be done using CMake. On Linux and macOS systems with make, building is straightforward. Here's an example of an out-of-source build using a command-line compiler and make (starting from the root of the CLBlast folder):
+
+    mkdir build
+    cd build
+    cmake ..
+    make
+    sudo make install  # (optional)
+
+A custom installation folder can be specified when calling CMake:
+
+    cmake -DCMAKE_INSTALL_PREFIX=/path/to/install/directory ..
+
+Building a static version of the library instead of a shared one (.dylib/.so) can be done by disabling the `BUILD_SHARED_LIBS` option when calling CMake. For example:
+
+    cmake -DBUILD_SHARED_LIBS=OFF ..
+
+In case you run into segfaults with OpenCL programs (known to happen with the AMD APP), you can try the following (thanks to [kpot](https://github.com/CNugteren/CLBlast/issues/243#issuecomment-367277297)):
+
+1. Use `-fPIC` or its analogue when compiling. In CMake you can do this by adding `set(CMAKE_POSITION_INDEPENDENT_CODE ON)` to the project config.
+
+2. Prevent CMake from adding RPATH entries to binaries. You can do this project-wide with `set(CMAKE_SKIP_BUILD_RPATH ON)` in CMake.
+
+
+Windows compilation from source
+-------------
+
+When using Visual Studio 2015, the project-files can be generated as follows:
+
+    mkdir build
+    cd build
+    cmake -G "Visual Studio 14 Win64" ..
+
+For another version, replace 14 with the appropriate version (12 for VS 2013, 15 for VS 2017). To generate a static version of the library instead of a .dll, specify `-DBUILD_SHARED_LIBS=OFF` when running cmake.
+
+
+Android compilation from source
+-------------
+
+For deployment on Android, there are three options to consider.
+
+First of all, you can use Google's recommended route of installing Android Studio with the NDK, and then use the JNI to interface to the CLBlast library. For this, we refer to the official Android Studio documentation and the online tutorials.
+
+Alternatively, you can cross-compile the library and the test/client/tuner executables directly. To do so, first install the NDK, then find your vendor's OpenCL library (e.g. in `/system/vendor/lib`), get OpenCL headers from the Khronos registry, and invoke CMake as follows:
+
+    cmake .. \
+     -DCMAKE_SYSTEM_NAME=Android \
+     -DCMAKE_SYSTEM_VERSION=19 \             # Set the appropriate Android API level
+     -DCMAKE_ANDROID_ARCH_ABI=armeabi-v7a \  # Set the appropriate device architecture (e.g. armeabi-v7a or arm64-v8a)
+     -DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \ # Assumes $ANDROID_NDK_PATH points to your NDK installation
+     -DCMAKE_ANDROID_STL_TYPE=gnustl_static \
+     -DOPENCL_ROOT=/path/to/vendor/OpenCL/lib/folder/   # Should contain libOpenCL.so and CL/cl.h
+
+For any potential issues, first check [cmath 'has not been declared' errors](https://stackoverflow.com/questions/45183525/compilation-error-with-ndk-using-cstatic/46433625). Also, if you are encountering errors such as `#error Bionic header ctype.h does not define either _U nor _CTYPE_U`, make sure CMake is not including system paths.
+
+Finally, a third option is to use the [Collective Knowledge framework](https://github.com/ctuning/ck) in combination with the NDK, e.g. as follows:
+
+    sudo pip install ck
+    ck pull repo:ck-math
+    ck install package:lib-clblast-master-universal --target_os=android21-arm64
+
+
+Compiling CLBlast with a CUDA back-end
+-------------
+
+There is also a CUDA API of CLBlast available. Enabling this compiles the whole library for CUDA and thus replaces the OpenCL API. It is based upon the CUDA runtime and NVRTC APIs, requiring NVIDIA CUDA 7.5 or higher. The CUDA version of the library can be used as follows after providing the `-DCUDA=ON -DOPENCL=OFF` flags to CMake:
+
+    #include <clblast_cuda.h>
diff --git a/doc/routines.md b/doc/routines.md
new file mode 100644
index 00000000..3ba8283e
--- /dev/null
+++ b/doc/routines.md
@@ -0,0 +1,112 @@
+CLBlast: Supported routines overview
+================
+
+This document describes which routines are supported in CLBlast. For other information about CLBlast, see the [main README](../README.md).
+
+Full API documentation is available in a separate [API documentation file](api.md).
+
+
+Supported types
+-------------
+
+The different data-types supported by the library are:
+
+* __S:__ Single-precision 32-bit floating-point (`float`).
+* __D:__ Double-precision 64-bit floating-point (`double`).
+* __C:__ Complex single-precision 2x32-bit floating-point (`std::complex<float>`).
+* __Z:__ Complex double-precision 2x64-bit floating-point (`std::complex<double>`).
+* __H:__ Half-precision 16-bit floating-point (`cl_half`). See section 'Half precision' below for more information.
+
+
+Supported routines
+-------------
+
+CLBlast supports almost all the Netlib BLAS routines plus a couple of extra non-BLAS routines. The supported BLAS routines are marked with '✔' in the following tables. Routines marked with '-' do not exist: they are not part of BLAS at all. An empty cell indicates a routine that is not (yet) supported by CLBlast for that data-type.
+
+| Level-1  | S | D | C | Z | H |
+| ---------|---|---|---|---|---|
+| xSWAP    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xSCAL    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xCOPY    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xAXPY    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xDOT     | ✔ | ✔ | - | - | ✔ |
+| xDOTU    | - | - | ✔ | ✔ | - |
+| xDOTC    | - | - | ✔ | ✔ | - |
+| xNRM2    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xASUM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| IxAMAX   | ✔ | ✔ | ✔ | ✔ | ✔ |
+
+| Level-2  | S | D | C | Z | H |
+| ---------|---|---|---|---|---|
+| xGEMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xGBMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHEMV    | - | - | ✔ | ✔ | - |
+| xHBMV    | - | - | ✔ | ✔ | - |
+| xHPMV    | - | - | ✔ | ✔ | - |
+| xSYMV    | ✔ | ✔ | - | - | ✔ |
+| xSBMV    | ✔ | ✔ | - | - | ✔ |
+| xSPMV    | ✔ | ✔ | - | - | ✔ |
+| xTRMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xTBMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xTPMV    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xGER     | ✔ | ✔ | - | - | ✔ |
+| xGERU    | - | - | ✔ | ✔ | - |
+| xGERC    | - | - | ✔ | ✔ | - |
+| xHER     | - | - | ✔ | ✔ | - |
+| xHPR     | - | - | ✔ | ✔ | - |
+| xHER2    | - | - | ✔ | ✔ | - |
+| xHPR2    | - | - | ✔ | ✔ | - |
+| xSYR     | ✔ | ✔ | - | - | ✔ |
+| xSPR     | ✔ | ✔ | - | - | ✔ |
+| xSYR2    | ✔ | ✔ | - | - | ✔ |
+| xSPR2    | ✔ | ✔ | - | - | ✔ |
+| xTRSV    | ✔ | ✔ | ✔ | ✔ |   |
+
+| Level-3  | S | D | C | Z | H |
+| ---------|---|---|---|---|---|
+| xGEMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xSYMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHEMM    | - | - | ✔ | ✔ | - |
+| xSYRK    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHERK    | - | - | ✔ | ✔ | - |
+| xSYR2K   | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xHER2K   | - | - | ✔ | ✔ | - |
+| xTRMM    | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xTRSM    | ✔ | ✔ | ✔ | ✔ |   |
+
+Furthermore, there are also batched versions of BLAS routines available, processing multiple smaller computations in one go for better performance:
+
+| Batched             | S | D | C | Z | H |
+| --------------------|---|---|---|---|---|
+| xAXPYBATCHED        | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xGEMMBATCHED        | ✔ | ✔ | ✔ | ✔ | ✔ |
+| xGEMMSTRIDEDBATCHED | ✔ | ✔ | ✔ | ✔ | ✔ |
+
+In addition, some extra non-BLAS routines are also supported by CLBlast, classified as level-X. They are experimental and should be used with care:
+
+| Level-X    | S | D | C | Z | H | Description |
+| -----------|---|---|---|---|---|-------------|
+| xSUM       | ✔ | ✔ | ✔ | ✔ | ✔ | Similar to xASUM, but not absolute |
+| IxAMIN     | ✔ | ✔ | ✔ | ✔ | ✔ | Similar to IxAMAX, but minimum instead of maximum |
+| IxMAX      | ✔ | ✔ | ✔ | ✔ | ✔ | Similar to IxAMAX, but not absolute |
+| IxMIN      | ✔ | ✔ | ✔ | ✔ | ✔ | Similar to IxAMAX, but not absolute and minimum instead of maximum |
+| xHAD       | ✔ | ✔ | ✔ | ✔ | ✔ | Hadamard product |
+| xOMATCOPY  | ✔ | ✔ | ✔ | ✔ | ✔ | Out-of-place copying/transposing/scaling of matrices |
+| xIM2COL    | ✔ | ✔ | ✔ | ✔ | ✔ | Image to column transform as used to express convolution as GEMM |
+| xCOL2IM    | ✔ | ✔ | ✔ | ✔ | ✔ | Column to image transform as used in machine learning |
+| xCONVGEMM  | ✔ | ✔ | - | - | ✔ | Experimental, implemented as either im2col followed by batched GEMM or as a single kernel |
+
+Some less commonly used BLAS routines are not yet supported by CLBlast. They are xROTG, xROTMG, xROT, xROTM, xTBSV, and xTPSV.
+
+
+Half precision (fp16)
+-------------
+
+The half-precision fp16 format is a 16-bit floating-point data-type. Some OpenCL devices support the `cl_khr_fp16` extension, reducing storage and bandwidth requirements by a factor of 2 compared to single-precision floating-point. In case the hardware also accelerates arithmetic on half-precision data-types, this can also greatly improve compute performance of e.g. level-3 routines such as GEMM. Devices which can benefit from this are among others Intel GPUs, ARM Mali GPUs, and NVIDIA's latest Pascal GPUs. Half-precision is of particular interest to the deep-learning community, in which convolutional neural networks can be processed much faster at a minor accuracy loss.
+
+Since there is no half-precision data-type in C or C++, OpenCL provides the `cl_half` type for the host device. Unfortunately, internally this translates to a 16-bit integer, so computations on the host using this data-type should be avoided. For convenience, CLBlast provides the `clblast_half.h` header (C99 and C++ compatible), defining the `half` type as a short-hand for `cl_half` and the following basic functions:
+
+* `half FloatToHalf(const float value)`: Converts a 32-bit floating-point value to a 16-bit floating-point value.
+* `float HalfToFloat(const half value)`: Converts a 16-bit floating-point value to a 32-bit floating-point value.
+
+The [samples/haxpy.c](../samples/haxpy.c) example shows how to use these convenience functions when calling the half-precision BLAS routine HAXPY.
diff --git a/doc/testing.md b/doc/testing.md
new file mode 100644
index 00000000..91cf0828
--- /dev/null
+++ b/doc/testing.md
@@ -0,0 +1,30 @@
+CLBlast: Testing the library for correctness
+================
+
+This document describes how to test the library. For other information about CLBlast, see the [main README](../README.md).
+
+
+Compiling the correctness tests
+-------------
+
+To make sure CLBlast is working correctly on your device (recommended), compile with the tests enabled by specifying `-DTESTS=ON`, for example as follows:
+
+    cmake -DTESTS=ON ..
+
+To build these tests, another BLAS library is needed to serve as a reference. This can be either:
+
+* The OpenCL BLAS library [clBLAS](http://github.com/clMathLibraries/clBLAS) (maintained by AMD)
+* A regular CPU Netlib BLAS library, e.g.:
+  - OpenBLAS
+  - BLIS
+  - Accelerate
+
+Afterwards, executables in the form of `clblast_test_xxxxx` are available, in which `xxxxx` is the name of a routine (e.g. `xgemm`). 
+
+
+Running the tests
+-------------
+
+All tests can be run as individual executables or directly together in one go through the `make alltests` target or using CTest (`make test` or `ctest`). In the latter case the output is less verbose. Both cases allow you to set the default device and platform to non-zero by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables. Further options can be supplied through the `CLBLAST_ARGUMENTS` environment variable (e.g. `export CLBLAST_ARGUMENTS="-full_test -cblas 1 -clblas 0"` on a UNIX system).
+
+Note that CLBlast is tested for correctness against [clBLAS](http://github.com/clMathLibraries/clBLAS) and/or a regular CPU BLAS library. If both are installed on your system, setting the command-line option `-clblas 1` or `-cblas 1` will select the library to test against for the `clblast_test_xxxxx` executables. All tests have a `-verbose` option to enable additional diagnostic output. They also have a `-full_test` option to increase coverage further.
diff --git a/doc/tuning.md b/doc/tuning.md
new file mode 100644
index 00000000..20af9bd5
--- /dev/null
+++ b/doc/tuning.md
@@ -0,0 +1,241 @@
+CLBlast: Tuning for better performance
+================
+
+This document describes how to tune CLBlast for better performance and lists for which devices tuned kernels are already available. For other information about CLBlast, see the [main README](../README.md).
+
+
+Already tuned-for devices
+-------------
+
+The CLBlast library is already tuned for the most commonly used OpenCL devices and it's gradually being extended to other devices as well. For unseen devices CLBlast will make use of common-best tuning values for similar architectures (e.g. AMD Fiji) or in general similar devices (e.g. AMD GPUs), so performance might still be decent. The current release of CLBlast is tuned for the following devices:
+
+* NVIDIA GPUs:
+  - GRID K520
+  - GeForce GT 650M
+  - GeForce GTX 480
+  - GeForce GTX 580
+  - GeForce GTX 670
+  - GeForce GTX 680
+  - GeForce GTX 750
+  - GeForce GTX 750 Ti
+  - GeForce GTX 760 Ti
+  - GeForce GTX 920MX
+  - GeForce GTX 970
+  - GeForce GTX 980
+  - GeForce GTX 1070
+  - GeForce GTX 1070 Ti
+  - GeForce GTX 1080
+  - GeForce GTX 1080 Ti
+  - GeForce GTX TITAN
+  - GeForce GTX TITAN Black
+  - GeForce GTX TITAN X
+  - TITAN X (Pascal)
+  - Tesla K20m
+  - Tesla K40m
+  - Tesla P100 16GB
+* AMD GPUs:
+  - Radeon HD 6750M
+  - Radeon HD 6770M
+  - Radeon HD 7970
+  - Radeon R9 270X
+  - Radeon R9 290X
+  - Radeon R9 M370X
+  - Radeon R9 380
+  - Radeon RX 480
+  - Radeon R9 Fury X
+  - Radeon Pro 580
+* Intel GPUs:
+  - HD Graphics 530
+  - HD Graphics 5500 BroadWell U-Processor GT2
+  - HD Graphics 6000 BroadWell U-Processor GT3
+  - HD Graphics 630
+  - HD Graphics Haswell Ultrabook GT2 Mobile
+  - HD Graphics IvyBridge M GT2
+  - HD Graphics Skylake ULT GT2
+  - Iris
+  - Iris Pro
+* Intel CPUs:
+  - Core i5-4570
+  - Core i5-4590S
+  - Core i5-6200U
+  - Core i7-920
+  - Core i7-2670QM
+  - Core i7-3770K
+  - Core i7-4790K
+  - Core i7-5930K
+  - Core i7-6770HQ
+  - Xeon E5-2630 v3
+  - Xeon E5-2630 v4
+* Other devices:
+  - ARM Mali-T628 GPU
+  - ARM Mali-T760 GPU
+  - Qualcomm Adreno 330 GPU
+  - Intel MIC
+
+If your device is not (yet) in this list or if you want to tune CLBlast for specific parameters (e.g. rectangular matrix sizes), you should run the included tuners.
+
+
+Compiling and running the tuners
+-------------
+
+The included CLBlast tuners are compiled with the default CMake options. If they are not compiled, make sure you are specifying `-DTUNERS=ON`, for example as follows:
+
+    cmake -DTUNERS=ON ..
+
+Compiling with `-DTUNERS=ON` will generate a number of tuners, each named `clblast_tuner_xxxxx`, in which `xxxxx` corresponds to a `.opencl` kernel file as found in `src/kernels`. These kernels correspond to routines (e.g. `xgemm`) or to common pre-processing or post-processing kernels (`copy` and `transpose`). Running such a tuner will test a number of parameter-value combinations on your device and report which one gave the best performance. Running `make alltuners` runs all tuners for all precisions in one go. You can set the default device and platform for `alltuners` by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables. Alternatively, you can also manually run each of the tuners for each of the precisions. Here's an example to tune the `axpy` kernels for 64-bit precision on device 0 of platform 0:
+
+    ./clblast_tuner_xaxpy -precision 64 -device 0 -platform 0
+
+The kernels `gemm` and `gemm_direct` have too many parameters to explore. Therefore, they will run in two stages: a first stage with a fixed limited number of parameter combinations, and a second stage with a random selection from a much larger search space. The random fraction is determined by the `fraction` argument on the command-line.
+
+There are also several routine-level tuners. They tune inter-kernel parameters and should only be run after the kernels are tuned. However, they do automatically pick up kernel tuning results from the current folder if there are any. An example is the GEMM routine tuner, which determines when to use the direct or the indirect GEMM kernel.
+
+Here are all the tuners included in the `make alltuners` target (in the same order) with all their precision arguments:
+
+    ./clblast_tuner_copy_fast -precision 32
+    ./clblast_tuner_copy_fast -precision 64
+    ./clblast_tuner_copy_fast -precision 3232
+    ./clblast_tuner_copy_fast -precision 6464
+    ./clblast_tuner_copy_fast -precision 16
+    ./clblast_tuner_copy_pad -precision 32
+    ./clblast_tuner_copy_pad -precision 64
+    ./clblast_tuner_copy_pad -precision 3232
+    ./clblast_tuner_copy_pad -precision 6464
+    ./clblast_tuner_copy_pad -precision 16
+    ./clblast_tuner_transpose_fast -precision 32
+    ./clblast_tuner_transpose_fast -precision 64
+    ./clblast_tuner_transpose_fast -precision 3232
+    ./clblast_tuner_transpose_fast -precision 6464
+    ./clblast_tuner_transpose_fast -precision 16
+    ./clblast_tuner_transpose_pad -precision 32
+    ./clblast_tuner_transpose_pad -precision 64
+    ./clblast_tuner_transpose_pad -precision 3232
+    ./clblast_tuner_transpose_pad -precision 6464
+    ./clblast_tuner_transpose_pad -precision 16
+    ./clblast_tuner_xaxpy -precision 32
+    ./clblast_tuner_xaxpy -precision 64
+    ./clblast_tuner_xaxpy -precision 3232
+    ./clblast_tuner_xaxpy -precision 6464
+    ./clblast_tuner_xaxpy -precision 16
+    ./clblast_tuner_xdot -precision 32
+    ./clblast_tuner_xdot -precision 64
+    ./clblast_tuner_xdot -precision 3232
+    ./clblast_tuner_xdot -precision 6464
+    ./clblast_tuner_xdot -precision 16
+    ./clblast_tuner_xger -precision 32
+    ./clblast_tuner_xger -precision 64
+    ./clblast_tuner_xger -precision 3232
+    ./clblast_tuner_xger -precision 6464
+    ./clblast_tuner_xger -precision 16
+    ./clblast_tuner_xgemm -precision 32
+    ./clblast_tuner_xgemm -precision 64
+    ./clblast_tuner_xgemm -precision 3232
+    ./clblast_tuner_xgemm -precision 6464
+    ./clblast_tuner_xgemm -precision 16
+    ./clblast_tuner_xgemm_direct -precision 32
+    ./clblast_tuner_xgemm_direct -precision 64
+    ./clblast_tuner_xgemm_direct -precision 3232
+    ./clblast_tuner_xgemm_direct -precision 6464
+    ./clblast_tuner_xgemm_direct -precision 16
+    ./clblast_tuner_xgemv -precision 32
+    ./clblast_tuner_xgemv -precision 64
+    ./clblast_tuner_xgemv -precision 3232
+    ./clblast_tuner_xgemv -precision 6464
+    ./clblast_tuner_xgemv -precision 16
+    ./clblast_tuner_invert -precision 32
+    ./clblast_tuner_invert -precision 64
+    ./clblast_tuner_invert -precision 3232
+    ./clblast_tuner_invert -precision 6464
+    ./clblast_tuner_invert -precision 16
+    ./clblast_tuner_routine_xgemm -precision 32
+    ./clblast_tuner_routine_xgemm -precision 64
+    ./clblast_tuner_routine_xgemm -precision 3232
+    ./clblast_tuner_routine_xgemm -precision 6464
+    ./clblast_tuner_routine_xgemm -precision 16
+    ./clblast_tuner_routine_xtrsv -precision 32
+    ./clblast_tuner_routine_xtrsv -precision 64
+    ./clblast_tuner_routine_xtrsv -precision 3232
+    ./clblast_tuner_routine_xtrsv -precision 6464
+    ./clblast_tuner_routine_xtrsv -precision 16
+
+
+Using the tuning results
+-------------
+
+The tuners output a JSON-file with the results. The best results need to be added to `src/database/kernels/xxxxx.hpp` in the appropriate section. However, this can be done automatically based on the JSON-data using a Python (2.7 or 3.x) script in `scripts/database/database.py`. If you want the found parameters to be included in future releases of CLBlast, please attach the JSON files to the corresponding issue on GitHub or [email the main author](http://www.cedricnugteren.nl).
+
+In summary, tuning the entire library for your device can be done as follows (starting from the root of the CLBlast folder):
+
+    mkdir build
+    cd build
+    cmake -DTUNERS=ON ..
+    make
+    make alltuners
+    python ../scripts/database/database.py . ..
+    make
+
+
+Tuning using the API (advanced users only)
+-------------
+
+Apart from running the tuning binaries, it is also possible to run the tuners programmatically through the CLBlast API. This could be useful if you want to tune for non-standard arguments (e.g. a rectangular or very small matrix). The tuning results can then also be set programmatically using `OverrideParameters`.
+
+The tuning API does not perform any disk or stdout I/O, thus it is not possible to track progress. Running the regular tuner binaries should give an idea of the number of configurations to explore for a particular device, thus giving an indication of a good value for the `fraction` argument (see the [API documentation](api.md) for more details).
+
+
+Inspecting and changing tuning parameters at run-time
+-------------
+
+Alternatively, you can also supply your tuning parameters programmatically through the CLBlast API. This is especially useful if you tune for specific non-standard arguments (e.g. a rectangular or a very small matrix). To do so, you can call the `OverrideParameters` function which will set new parameters for a specific kernel. On the next call of the target routine, CLBlast will compile a new binary and use it together with the new parameters from then on, until `OverrideParameters` is called again. This is the API:
+
+    StatusCode PUBLIC_API OverrideParameters(const cl_device_id device, const std::string &kernel_name,
+                                             const Precision precision,
+                                             const std::unordered_map<std::string,size_t> &parameters)
+
+To inspect current behaviour, you can also retrieve the parameters for a specific device and kernel combination:
+
+    StatusCode PUBLIC_API RetrieveParameters(const cl_device_id device, const std::string &kernel_name,
+                                             const Precision precision,
+                                             std::unordered_map<std::string,size_t> &parameters)
+
+These two functions require/retrieve the parameters as given in [src/database/kernels](../src/database/kernels), i.e.:
+
+| Kernel name         | Parameters            |
+| --------------------|-----------------------|
+| Xaxpy               |  VW, WGS, WPT         |
+| Xdot                |  WGS1, WGS2           |
+| Xgemv               |  WGS1, WPT1           |
+| XgemvFast           |  VW2, WGS2, WPT2      |
+| XgemvFastRot        |  VW3, WGS3, WPT3      |
+| Xger                |  WGS1, WGS2, WPT      |
+| Xtrsv               |  TRSV_BLOCK_SIZE      |
+| Xgemm               |  GEMMK, KREG, KWG, KWI, MDIMA, MDIMC, MWG, NDIMB, NDIMC, NWG, SA, SB, STRM, STRN, VWM, VWN |
+| XgemmDirect         |  KWID, MDIMAD, MDIMCD, NDIMBD, NDIMCD, PADA, PADB, VWMD, VWND, WGD |
+| Copy                |  COPY_DIMX, COPY_DIMY, COPY_VW, COPY_WPT |
+| Pad                 |  PAD_DIMX, PAD_DIMY, PAD_WPTX, PAD_WPTY |
+| Transpose           |  TRA_DIM, TRA_PAD, TRA_SHUFFLE, TRA_WPT |
+| Padtranspose        |  PADTRA_PAD, PADTRA_TILE, PADTRA_WPT |
+| Invert              |  INTERNAL_BLOCK_SIZE  |
+| TrsvRoutine         |  TRSV_BLOCK_SIZE      |
+
+
+Tuning OpenCL compiler options
+-------------
+
+For all of CLBlast's APIs, it is possible to optionally set an OS environment variable `CLBLAST_BUILD_OPTIONS` to pass specific build options to the OpenCL compiler. Also make sure this is set in the same way when running the tuners.
+
+
+Which kernels are used for which routines?
+-------------
+
+To find out which tuners to run for which routines, you can use the table below. The kernel names correspond to the tuner binaries, the tuner API, and to the arguments for `OverrideParameters` and `RetrieveParameters`.
+
+| Routines                                                                 | Kernel(s) / Tuner(s)            |
+| -------------------------------------------------------------------------|---------------------------------|
+| AXPY COPY SCAL SWAP OMATCOPY AXPYBATCHED                                 | Xaxpy                           |
+| AMAX ASUM DOT DOTC DOTU NRM2 SUM MAX MIN AMIN                            | Xdot                            |
+| GBMV GEMV HBMV HEMV HPMV SBMV SPMV SYMV TBMV TPMV TRMV TRSV              | Xgemv                           |
+| GER GERC GERU HER HER2 HPR HPR2 SPR SPR2 SYR SYR2                        | Xger                            |
+| GEMM HEMM HER2K HERK SYMM SYR2K SYRK TRMM GEMMBATCHED GEMMSTRIDEDBATCHED | Xgemm XgemmDirect Copy Pad Transpose Padtranspose |
+| TRSM                                                                     | Xgemm XgemmDirect Copy Pad Transpose Padtranspose Invert |
+| IM2COL COL2IM                                                            | Copy                            |