summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2016-10-25 20:37:33 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2016-10-25 20:37:33 +0200
commitbb14a5880efea3bb8a80a53bf45fc0c5378d5db6 (patch)
treebc66e45a050e3c1a9bc9af2ab9f1a4b40cd20324
parent8ae8ab06a2b6f24faa0de5d390a5ae272aa94c23 (diff)
Added an example and documentation for the Netlib CBLAS API
-rw-r--r--CHANGELOG1
-rw-r--r--CMakeLists.txt2
-rw-r--r--README.md4
-rw-r--r--samples/sgemm_netlib.c69
4 files changed, 75 insertions, 1 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 48305f03..efe614cb 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,7 @@
Development version (next release)
- Updated to version 8.0 of the CLCudaAPI C++11 OpenCL header
- Changed the enums in the C API to avoid potential name clashes with external code
+- Added a Netlib CBLAS compatible API (not recommended for full control over performance)
- Greatly improved the way exceptions are handled in the library (thanks to 'intelfx')
- Improved performance of GEMM kernels for small sizes by using a direct single-kernel implementation
- Fixed a bug in the tests and samples related to waiting for an invalid event
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa1e287e..aaac87f2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -150,7 +150,7 @@ endif()
set(KERNELS copy_fast copy_pad transpose_fast transpose_pad xaxpy xdot xger
xgemm xgemm_direct xgemv)
set(SAMPLE_PROGRAMS_CPP sgemm)
-set(SAMPLE_PROGRAMS_C sasum dgemv sgemm haxpy cache)
+set(SAMPLE_PROGRAMS_C sasum dgemv sgemm sgemm_netlib haxpy cache)
set(LEVEL1_ROUTINES xswap xscal xcopy xaxpy xdot xdotu xdotc xnrm2 xasum xamax)
set(LEVEL2_ROUTINES xgemv xgbmv xhemv xhbmv xhpmv xsymv xsbmv xspmv xtrmv xtbmv xtpmv
xger xgeru xgerc xher xhpr xher2 xhpr2 xsyr xspr xsyr2 xspr2)
diff --git a/README.md b/README.md
index 9b289448..20a320d3 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,10 @@ Or alternatively the plain C version:
#include <clblast_c.h>
+There is also a Netlib CBLAS C API available. This is however not recommended for full control over performance, since at every call it will copy all buffers to and from the OpenCL device. Especially for level 1 and level 2 BLAS functions, performance will be impacted severely. However, it can be useful if you don't want to touch OpenCL at all. You can set the default device and platform by setting the `CLBLAST_DEVICE` and `CLBLAST_PLATFORM` environment variables. This API can be used as follows:
+
+ #include <clblast_netlib_c.h>
+
Afterwards, any of CLBlast's routines can be called directly: there is no need to initialize the library. The available routines and the required arguments are described in the above mentioned include files and the included [API documentation](doc/clblast.md). Additionally, a couple of stand-alone example programs are included in the `samples` subfolder. They can optionally be compiled using the CMake infrastructure of CLBlast by providing the `-DSAMPLES=ON` flag, for example as follows:
cmake -DSAMPLES=ON ..
diff --git a/samples/sgemm_netlib.c b/samples/sgemm_netlib.c
new file mode 100644
index 00000000..0c8f76e9
--- /dev/null
+++ b/samples/sgemm_netlib.c
@@ -0,0 +1,69 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not
+// recommended if you want full control over performance: it will internally copy buffers from and
+// to the OpenCL device.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+// Includes the CLBlast library (Netlib CBLAS interface)
+#include <clblast_netlib_c.h>
+
+// =================================================================================================
+
+// Example: runs the single-precision routine SGEMM through the Netlib CBLAS API on host arrays
+int main(void) {
+
+ // Example SGEMM arguments: problem sizes, scalars and leading dimensions
+ const int m = 128;
+ const int n = 64;
+ const int k = 512;
+ const float alpha = 0.7f;
+ const float beta = 1.0f;
+ const int a_ld = k; // host_a is allocated as m*k floats below
+ const int b_ld = n; // host_b is allocated as n*k floats below
+ const int c_ld = n; // host_c is allocated as m*n floats below
+
+ // Allocate and fill the host matrices. NOTE(review): the malloc results are not NULL-checked
+ float* host_a = (float*)malloc(sizeof(float)*m*k);
+ float* host_b = (float*)malloc(sizeof(float)*n*k);
+ float* host_c = (float*)malloc(sizeof(float)*m*n);
+ for (int i=0; i<m*k; ++i) { host_a[i] = 12.193f; }
+ for (int i=0; i<n*k; ++i) { host_b[i] = -8.199f; }
+ for (int i=0; i<m*n; ++i) { host_c[i] = 0.0f; }
+
+ // Call the SGEMM routine on the host pointers directly (row-major layout, no transposes)
+ cblas_sgemm(CLBlastLayoutRowMajor,
+ CLBlastTransposeNo, CLBlastTransposeNo,
+ m, n, k,
+ alpha,
+ host_a, a_ld,
+ host_b, b_ld,
+ beta,
+ host_c, c_ld);
+
+ // Example completed: only a message is printed, the contents of host_c are not inspected
+ printf("Completed SGEMM\n");
+
+ // Clean-up: release the three host allocations
+ free(host_a);
+ free(host_b);
+ free(host_c);
+ return 0;
+}
+
+// =================================================================================================