Merge tag '1.5.1' into debian/sid

author: Gard Spreemann <gspr@nonempty.org> 2020-12-22 15:39:15 +0100
committer: Gard Spreemann <gspr@nonempty.org> 2020-12-22 15:39:15 +0100
commit: 7b1d3e5f0a1a36a469905e0b73d48cfea4d1bd46 (patch)
tree: e211fcdf8cee8d5841ef0dd7b41a89f542444ff7 /samples
parent: 6408c2fc41fa1b04d6abf470bafb9961a28c90cd (diff)
parent: 8433985051c0fb9758fd8dfe7d19cc8eaca630e1 (diff)
12 files changed, 1274 insertions, 0 deletions
diff --git a/samples/cache.c b/samples/cache.c
new file mode 100644
index 00000000..980c7cf3
--- /dev/null
+++ b/samples/cache.c
@@ -0,0 +1,138 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the CLBlast kernel cache, which stores compiled OpenCL binaries for faster
+// repeated kernel execution. The cache can be pre-initialized or cleared.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// Forward declaration
+void run_example_routine(const cl_device_id device);
+
+// =================================================================================================
+
+// Example use of the CLBlast kernel cache: re-runs one routine around cache clear/fill calls
+int main(void) {
+
+  // OpenCL platform/device settings (indices into the ID arrays queried below)
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Initializes the OpenCL platform: queries the count first, then the IDs (errors are unchecked)
+  cl_uint num_platforms;
+  clGetPlatformIDs(0, NULL, &num_platforms);
+  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_platform_id platform = platforms[platform_id];
+
+  // Initializes the OpenCL device the same way, considering devices of all types
+  cl_uint num_devices;
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_device_id device = devices[device_id];
+
+  // Run the routine multiple times in a row: after the first time the binary is already in the
+  // cache and compilation is no longer needed.
+  printf("Starting caching sample with an empty cache\n");
+  run_example_routine(device);
+  run_example_routine(device);
+  run_example_routine(device);
+
+  // Clearing the cache makes CLBlast re-compile the kernel once
+  printf("Clearing cache\n");
+  CLBlastClearCache();
+  run_example_routine(device);
+  run_example_routine(device);
+
+  // When the cache is empty, it can be pre-initialized with compiled kernels for all routines by
+  // calling the CLBlastFillCache function, such that all other CLBlast calls can benefit from
+  // pre-compiled kernels and thus execute at maximum speed.
+  printf("Clearing cache\n");
+  CLBlastClearCache();
+  printf("Filling cache (this might take a while)\n");
+  CLBlastFillCache(device);
+  run_example_routine(device);
+
+  // Clean-up of the host-side ID arrays
+  free(platforms);
+  free(devices);
+  return 0;
+}
+
+// =================================================================================================
+
+// Runs an example routine (SASUM) on a fresh context and queue and reports the time of the call
+void run_example_routine(const cl_device_id device) {
+
+  // Example SASUM argument: the number of input elements
+  const size_t n = 1024*128;
+
+  // Creates the OpenCL context, queue, and an event
+  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+  cl_event event = NULL;
+
+  // Populate host data structures with some example data
+  float* host_input = (float*)malloc(sizeof(float)*n);
+  float* host_output = (float*)malloc(sizeof(float)*1);
+  for (size_t i=0; i<n; ++i) { host_input[i] = -1.5f; }
+  for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; }
+
+  // Creates the device buffers and copies the host data into them (blocking writes)
+  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
+  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
+
+  // Start the timer. NOTE(review): clock() is processor time and may differ from wall-clock time
+  clock_t start = clock();
+
+  // Calls an example routine
+  CLBlastStatusCode status = CLBlastSasum(n,
+                                          device_output, 0,
+                                          device_input, 0, 1,
+                                          &queue, &event);
+
+  // Wait for completion; the event is only waited on and released when the call succeeded
+  if (status == CLBlastSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+
+  // Retrieves the execution time, converted from clock ticks to milliseconds
+  clock_t diff = clock() - start;
+  double time_ms = diff * 1000.0f / (double)CLOCKS_PER_SEC;
+
+  // Routine completed. See "clblast_c.h" for status codes (0 -> success).
+  printf("Completed routine with status %d in %.3lf ms\n", status, time_ms);
+
+  // Clean-up of the host memory and of the OpenCL objects created in this function
+  free(host_input);
+  free(host_output);
+  clReleaseMemObject(device_input);
+  clReleaseMemObject(device_output);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+}
+
+// =================================================================================================
diff --git a/samples/daxpy_cuda.cpp b/samples/daxpy_cuda.cpp
new file mode 100644
index 00000000..cead3f6d
--- /dev/null
+++ b/samples/daxpy_cuda.cpp
@@ -0,0 +1,88 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the DAXPY routine with the C++ CUDA API of CLBlast.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+// Includes the CUDA driver API
+#include <cuda.h>
+
+// Includes the CLBlast library
+#include <clblast_cuda.h>
+
+// =================================================================================================
+
+// Example use of the double-precision Xaxpy routine DAXPY through the CUDA driver API
+int main() {
+
+  // CUDA device selection
+  const auto device_id = 0;
+
+  // Example DAXPY arguments
+  const size_t n = 8192;
+  const double alpha = 0.7;
+
+  // Initializes the CUDA driver API and selects the device
+  cuInit(0);
+  CUdevice device;
+  cuDeviceGet(&device, device_id);
+
+  // Creates the CUDA context and stream
+  CUcontext context;
+  cuCtxCreate(&context, 0, device);
+  CUstream stream;
+  cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
+
+  // Populate host vectors with some example data
+  auto host_a = std::vector<double>(n);
+  auto host_b = std::vector<double>(n);
+  for (auto &item: host_a) { item = 12.193; }
+  for (auto &item: host_b) { item = -8.199; }
+
+  // Allocates the device vectors and enqueues the host-to-device copies on the stream
+  CUdeviceptr device_a;
+  CUdeviceptr device_b;
+  cuMemAlloc(&device_a, host_a.size()*sizeof(double));
+  cuMemAlloc(&device_b, host_b.size()*sizeof(double));
+  cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(double), stream);
+  cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(double), stream);
+
+  // Start the timer
+  auto start_time = std::chrono::steady_clock::now();
+
+  // Call the DAXPY routine. Note that the type of alpha (double) determines the precision.
+  const auto status = clblast::Axpy(n, alpha,
+                                    device_a, 0, 1,
+                                    device_b, 0, 1,
+                                    context, device);
+  cuStreamSynchronize(stream);
+
+  // Record the execution time
+  auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+  auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
+
+  // Example completed. See "clblast_cuda.h" for status codes (0 -> success).
+  printf("Completed DAXPY in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
+
+  // Clean-up. NOTE(review): the context from cuCtxCreate is not destroyed here (no cuCtxDestroy)
+  cuMemFree(device_a);
+  cuMemFree(device_b);
+  cuStreamDestroy(stream);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/dgemv.c b/samples/dgemv.c
new file mode 100644
index 00000000..975cb7ac
--- /dev/null
+++ b/samples/dgemv.c
@@ -0,0 +1,111 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the DGEMV routine. It is pure C99 and demonstrates the use of
+// the C API to the CLBlast library.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// =================================================================================================
+
+// Example use of the double-precision routine DGEMV
+int main(void) {
+
+  // OpenCL platform/device settings (indices into the ID arrays queried below)
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example DGEMV arguments: A is m-by-n and stored row-major, so its leading dimension is n
+  const size_t m = 128;
+  const size_t n = 289;
+  const double alpha = 0.7;
+  const double beta = 0.0;
+  const size_t a_ld = n;
+
+  // Initializes the OpenCL platform: queries the count first, then the IDs (errors are unchecked)
+  cl_uint num_platforms;
+  clGetPlatformIDs(0, NULL, &num_platforms);
+  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_platform_id platform = platforms[platform_id];
+
+  // Initializes the OpenCL device the same way, considering devices of all types
+  cl_uint num_devices;
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_device_id device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+  cl_event event = NULL;
+
+  // Populate host data structures with some example data
+  double* host_a = (double*)malloc(sizeof(double)*m*n);
+  double* host_x = (double*)malloc(sizeof(double)*n);
+  double* host_y = (double*)malloc(sizeof(double)*m);
+  for (size_t i=0; i<m*n; ++i) { host_a[i] = 12.193; }
+  for (size_t i=0; i<n; ++i) { host_x[i] = -8.199; }
+  for (size_t i=0; i<m; ++i) { host_y[i] = 0.0; }
+
+  // Creates the device buffers and copies the host data into them (blocking writes)
+  cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(double), NULL, NULL);
+  cl_mem device_x = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, NULL);
+  cl_mem device_y = clCreateBuffer(context, CL_MEM_READ_WRITE, m*sizeof(double), NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, m*n*sizeof(double), host_a, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_x, CL_TRUE, 0, n*sizeof(double), host_x, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL);
+
+  // Call the DGEMV routine on the row-major, non-transposed matrix A.
+  CLBlastStatusCode status = CLBlastDgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo,
+                                          m, n,
+                                          alpha,
+                                          device_a, 0, a_ld,
+                                          device_x, 0, 1,
+                                          beta,
+                                          device_y, 0, 1,
+                                          &queue, &event);
+
+  // Wait for completion; the event is only waited on and released when the call succeeded
+  if (status == CLBlastSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+
+  // Example completed. See "clblast_c.h" for status codes (0 -> success).
+  printf("Completed DGEMV with status %d\n", status);
+
+  // Clean-up (the result in device_y is never read back to the host in this sample)
+  free(platforms);
+  free(devices);
+  free(host_a);
+  free(host_x);
+  free(host_y);
+  clReleaseMemObject(device_a);
+  clReleaseMemObject(device_x);
+  clReleaseMemObject(device_y);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/dtrsm.cpp b/samples/dtrsm.cpp
new file mode 100644
index 00000000..6d37dad7
--- /dev/null
+++ b/samples/dtrsm.cpp
@@ -0,0 +1,117 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the DTRSM routine. It is a stand-alone example, but it does
+// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
+// features, but CLBlast can also be used using the regular C-style OpenCL API.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <vector>
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the C++ OpenCL API. If not yet available, it can be found here:
+// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
+#include "cl.hpp"
+
+// Includes the CLBlast library
+#include <clblast.h>
+
+// =================================================================================================
+
+// Example use of the double-precision Xtrsm routine DTRSM, solving A*X = alpha*B, storing the
+// result in the memory of matrix B. Uses row-major storage (C-style).
+int main() {
+
+  // OpenCL platform/device settings
+  const auto platform_id = 0;
+  const auto device_id = 0;
+
+  // Example TRSM arguments
+  const size_t m = 4;
+  const size_t n = 3;
+  const double alpha = 1.0;
+  const auto a_ld = m;
+  const auto b_ld = n;
+
+  // Initializes the OpenCL platform (returns 1 when the requested platform is unavailable)
+  auto platforms = std::vector<cl::Platform>();
+  cl::Platform::get(&platforms);
+  if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
+  auto platform = platforms[platform_id];
+
+  // Initializes the OpenCL device (returns 1 when the requested device is unavailable)
+  auto devices = std::vector<cl::Device>();
+  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+  if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
+  auto device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  auto device_as_vector = std::vector<cl::Device>{device};
+  auto context = cl::Context(device_as_vector);
+  auto queue = cl::CommandQueue(context, device);
+  auto event = cl_event{nullptr};
+
+  // Populate host matrices with some example data: A is 4x4 upper-triangular, B is 4x3
+  auto host_a = std::vector<double>({1.0,  2.0,  1.0, -2.0,
+                                    0.0, -1.0, -2.0,  0.0,
+                                    0.0,  0.0,  1.0,  1.0,
+                                    0.0,  0.0,  0.0, -1.0});
+  auto host_b = std::vector<double>({-1.0, -1.0,  3.0,
+                                     1.0, -3.0,  2.0,
+                                     1.0,  1.0, -1.0,
+                                     4.0, -1.0, -2.0});
+  // Expected result:
+  //   8 -5  2
+  // -11  3  4
+  //   5  0 -3
+  //  -4  1  2
+
+  // Copy the matrices to the device
+  auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(double));
+  auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(double));
+  queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(double), host_a.data());
+  queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());
+
+  // Call the DTRSM routine. Note that the type of alpha (double) determines the precision.
+  auto queue_plain = queue();
+  auto status = clblast::Trsm(clblast::Layout::kRowMajor, clblast::Side::kLeft,
+                              clblast::Triangle::kUpper, clblast::Transpose::kNo,
+                              clblast::Diagonal::kNonUnit,
+                              m, n,
+                              alpha,
+                              device_a(), 0, a_ld,
+                              device_b(), 0, b_ld,
+                              &queue_plain, &event);
+
+  // Waits for completion (on success only), then reads device_b back into host_b either way
+  if (status == clblast::StatusCode::kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+  queue.enqueueReadBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());
+
+  // Example completed. See "clblast.h" for status codes (0 -> success).
+  printf("Completed TRSM with status %d and results:\n", static_cast<int>(status));
+  for (auto i = size_t{0}; i < m; ++i) {
+    for (auto j = size_t{0}; j < n; ++j) {
+      printf("%3.0f ", host_b[i * b_ld + j]);
+    }
+    printf("\n");
+  }
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/haxpy.c b/samples/haxpy.c
new file mode 100644
index 00000000..4f2bb400
--- /dev/null
+++ b/samples/haxpy.c
@@ -0,0 +1,110 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the HAXPY routine. It demonstrates the use of half-precision.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// Includes the float-to-half and half-to-float conversion utilities
+#include <clblast_half.h>
+
+// =================================================================================================
+
+// Example use of the half-precision routine HAXPY
+int main(void) {
+
+  // OpenCL platform/device settings (indices into the ID arrays queried below)
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example HAXPY arguments (alpha is converted from float to half precision)
+  const size_t n = 8192;
+  const cl_half alpha = FloatToHalf(0.5f);
+
+  // Initializes the OpenCL platform: queries the count first, then the IDs (errors are unchecked)
+  cl_uint num_platforms;
+  clGetPlatformIDs(0, NULL, &num_platforms);
+  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_platform_id platform = platforms[platform_id];
+
+  // Initializes the OpenCL device the same way, considering devices of all types
+  cl_uint num_devices;
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_device_id device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+  cl_event event = NULL;
+
+  // Populate host vectors with some example data, converted to half precision
+  cl_half* host_a = (cl_half*)malloc(sizeof(cl_half)*n);
+  cl_half* host_b = (cl_half*)malloc(sizeof(cl_half)*n);
+  for (size_t i=0; i<n; ++i) { host_a[i] = FloatToHalf(2.2f); }
+  for (size_t i=0; i<n; ++i) { host_b[i] = FloatToHalf(0.4f); }
+  printf("Input values at index 0: alpha * a[0] + b[0] == %.3lf * %.3lf + %.3lf\n",
+         HalfToFloat(alpha), HalfToFloat(host_a[0]), HalfToFloat(host_b[0]));
+
+  // Creates the device buffers and copies the host vectors into them (blocking writes)
+  cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL);
+  cl_mem device_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, n*sizeof(cl_half), host_a, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
+
+  // Call the HAXPY routine; device_b is read back below as the output.
+  CLBlastStatusCode status = CLBlastHaxpy(n, alpha,
+                                          device_a, 0, 1,
+                                          device_b, 0, 1,
+                                          &queue, &event);
+
+  // Wait for completion; the event is only waited on and released when the call succeeded
+  if (status == CLBlastSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+
+  // Copies the result back to the host
+  clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL);
+
+  // Example completed. See "clblast_c.h" for status codes (0 -> success).
+  printf("Completed HAXPY with status %d\n", status);
+
+  // Prints the first output value, converted back to float, when the routine succeeded
+  if (status == 0) {
+    printf("Output value at index 0: b[0] = %.3lf\n", HalfToFloat(host_b[0]));
+  }
+
+  // Clean-up of the host memory and of the OpenCL objects created above
+  free(platforms);
+  free(devices);
+  free(host_a);
+  free(host_b);
+  clReleaseMemObject(device_a);
+  clReleaseMemObject(device_b);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sasum.c b/samples/sasum.c
new file mode 100644
index 00000000..78377336
--- /dev/null
+++ b/samples/sasum.c
@@ -0,0 +1,101 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the SASUM routine. It is pure C99 and demonstrates the use of
+// the C API to the CLBlast library.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// =================================================================================================
+
+// Example use of the single-precision routine SASUM
+int main(void) {
+
+  // OpenCL platform/device settings (indices into the ID arrays queried below)
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example SASUM arguments: the vector length and the value every input element is set to
+  const size_t n = 1000;
+  const float input_value = -1.5f;
+
+  // Initializes the OpenCL platform: queries the count first, then the IDs (errors are unchecked)
+  cl_uint num_platforms;
+  clGetPlatformIDs(0, NULL, &num_platforms);
+  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_platform_id platform = platforms[platform_id];
+
+  // Initializes the OpenCL device the same way, considering devices of all types
+  cl_uint num_devices;
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_device_id device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+  cl_event event = NULL;
+
+  // Populate host data structures with some example data
+  float* host_input = (float*)malloc(sizeof(float)*n);
+  float* host_output = (float*)malloc(sizeof(float)*1);
+  for (size_t i=0; i<n; ++i) { host_input[i] = input_value; }
+  for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; }
+
+  // Creates the device buffers and copies the host data into them (blocking writes)
+  cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL);
+  cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
+
+  // Call the SASUM routine; device_output is read back below as the single-float result.
+  CLBlastStatusCode status = CLBlastSasum(n,
+                                          device_output, 0,
+                                          device_input, 0, 1,
+                                          &queue, &event);
+
+  // Wait for completion; the event is only waited on and released when the call succeeded
+  if (status == CLBlastSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+
+  // Copies the result back to the host
+  clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL);
+
+  // Example completed. See "clblast_c.h" for status codes (0 -> success).
+  printf("Completed SASUM with status %d: %zu * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]);
+
+  // Clean-up of the host memory and of the OpenCL objects created above
+  free(platforms);
+  free(devices);
+  free(host_input);
+  free(host_output);
+  clReleaseMemObject(device_input);
+  clReleaseMemObject(device_output);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sgemm.c b/samples/sgemm.c
new file mode 100644
index 00000000..92f3057d
--- /dev/null
+++ b/samples/sgemm.c
@@ -0,0 +1,115 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the SGEMM routine. It is pure C99 and demonstrates the use of
+// the C API to the CLBlast library.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the CLBlast library (C interface)
+#include <clblast_c.h>
+
+// =================================================================================================
+
+// Example use of the single-precision routine SGEMM
+int main(void) {
+
+  // OpenCL platform/device settings (indices into the ID arrays queried below)
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example SGEMM arguments: A holds m*k, B holds n*k and C holds m*n floats (row-major layout)
+  const size_t m = 128;
+  const size_t n = 64;
+  const size_t k = 512;
+  const float alpha = 0.7f;
+  const float beta = 1.0f;
+  const size_t a_ld = k;
+  const size_t b_ld = n;
+  const size_t c_ld = n;
+
+  // Initializes the OpenCL platform: queries the count first, then the IDs (errors are unchecked)
+  cl_uint num_platforms;
+  clGetPlatformIDs(0, NULL, &num_platforms);
+  cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+  clGetPlatformIDs(num_platforms, platforms, NULL);
+  cl_platform_id platform = platforms[platform_id];
+
+  // Initializes the OpenCL device the same way, considering devices of all types
+  cl_uint num_devices;
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+  cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+  clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+  cl_device_id device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
+  cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
+  cl_event event = NULL;
+
+  // Populate host matrices with some example data
+  float* host_a = (float*)malloc(sizeof(float)*m*k);
+  float* host_b = (float*)malloc(sizeof(float)*n*k);
+  float* host_c = (float*)malloc(sizeof(float)*m*n);
+  for (size_t i=0; i<m*k; ++i) { host_a[i] = 12.193f; }
+  for (size_t i=0; i<n*k; ++i) { host_b[i] = -8.199f; }
+  for (size_t i=0; i<m*n; ++i) { host_c[i] = 0.0f; }
+
+  // Creates the device buffers and copies the host matrices into them (blocking writes)
+  cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*k*sizeof(float), NULL, NULL);
+  cl_mem device_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*k*sizeof(float), NULL, NULL);
+  cl_mem device_c = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(float), NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, m*k*sizeof(float), host_a, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*k*sizeof(float), host_b, 0, NULL, NULL);
+  clEnqueueWriteBuffer(queue, device_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL);
+
+  // Call the SGEMM routine without transposing A or B.
+  CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutRowMajor,
+                                          CLBlastTransposeNo, CLBlastTransposeNo,
+                                          m, n, k,
+                                          alpha,
+                                          device_a, 0, a_ld,
+                                          device_b, 0, b_ld,
+                                          beta,
+                                          device_c, 0, c_ld,
+                                          &queue, &event);
+
+  // Wait for completion; the event is only waited on and released when the call succeeded
+  if (status == CLBlastSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+
+  // Example completed. See "clblast_c.h" for status codes (0 -> success).
+  printf("Completed SGEMM with status %d\n", status);
+
+  // Clean-up (the result in device_c is never read back to the host in this sample)
+  free(platforms);
+  free(devices);
+  free(host_a);
+  free(host_b);
+  free(host_c);
+  clReleaseMemObject(device_a);
+  clReleaseMemObject(device_b);
+  clReleaseMemObject(device_c);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp
new file mode 100644
index 00000000..ab7858e1
--- /dev/null
+++ b/samples/sgemm.cpp
@@ -0,0 +1,114 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does
+// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
+// features, but CLBlast can also be used using the regular C-style OpenCL API.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the C++ OpenCL API. If not yet available, it can be found here:
+// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
+#include "cl.hpp"
+
+// Includes the CLBlast library
+#include <clblast.h>
+
+// =================================================================================================
+
+// Example use of the single-precision Xgemm routine SGEMM
+int main() {
+
+  // OpenCL platform/device settings (size_t, as they index the std::vector lists below)
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example SGEMM arguments: row-major, non-transposed A (m x k), B (k x n) and C (m x n)
+  const size_t m = 128;
+  const size_t n = 64;
+  const size_t k = 512;
+  const float alpha = 0.7f;
+  const float beta = 1.0f;
+  const auto a_ld = k;
+  const auto b_ld = n;
+  const auto c_ld = n;
+
+  // Initializes the OpenCL platform (the range check also covers an empty platform list)
+  auto platforms = std::vector<cl::Platform>();
+  cl::Platform::get(&platforms);
+  if (platform_id >= platforms.size()) { return 1; }
+  auto platform = platforms[platform_id];
+
+  // Initializes the OpenCL device (the range check also covers an empty device list)
+  auto devices = std::vector<cl::Device>();
+  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+  if (device_id >= devices.size()) { return 1; }
+  auto device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  auto device_as_vector = std::vector<cl::Device>{device};
+  auto context = cl::Context(device_as_vector);
+  auto queue = cl::CommandQueue(context, device);
+  auto event = cl_event{nullptr};
+
+  // Populate host matrices with some example data
+  auto host_a = std::vector<float>(m*k);
+  auto host_b = std::vector<float>(n*k);
+  auto host_c = std::vector<float>(m*n);
+  for (auto &item: host_a) { item = 12.193f; }
+  for (auto &item: host_b) { item = -8.199f; }
+  for (auto &item: host_c) { item = 0.0f; }
+
+  // Copy the matrices to the device (CL_TRUE requests blocking writes)
+  auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(float));
+  auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(float));
+  auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float));
+  queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data());
+  queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data());
+  queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data());
+
+  // Start the timer
+  const auto start_time = std::chrono::steady_clock::now();
+
+  // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision.
+  auto queue_plain = queue(); // CLBlast takes a pointer to the plain OpenCL queue handle
+  const auto status = clblast::Gemm(clblast::Layout::kRowMajor,
+                                    clblast::Transpose::kNo, clblast::Transpose::kNo,
+                                    m, n, k,
+                                    alpha,
+                                    device_a(), 0, a_ld,
+                                    device_b(), 0, b_ld,
+                                    beta,
+                                    device_c(), 0, c_ld,
+                                    &queue_plain, &event);
+
+  // Record the execution time (the event is only waited on and released after a successful call)
+  if (status == clblast::StatusCode::kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+  const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+  const auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
+
+  // Example completed. See "clblast.h" for status codes (0 -> success).
+  printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sgemm_batched.cpp b/samples/sgemm_batched.cpp
new file mode 100644
index 00000000..32c465c7
--- /dev/null
+++ b/samples/sgemm_batched.cpp
@@ -0,0 +1,129 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the batched SGEMM routine. It is a stand-alone example, but it
+// does require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
+// features, but CLBlast can also be used using the regular C-style OpenCL API.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the C++ OpenCL API. If not yet available, it can be found here:
+// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
+#include "cl.hpp"
+
+// Includes the CLBlast library
+#include <clblast.h>
+
+// =================================================================================================
+
+// Example use of the single-precision batched SGEMM routine
+int main() {
+
+  // OpenCL platform/device settings (size_t, as they index the std::vector lists below)
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example arguments: all batches use offset 0 into A and B, each has its own offset into C
+  const size_t batch_count = 261;
+  const size_t m = 1;
+  const size_t n = 1;
+  const size_t k = 40;
+  const size_t a_ld = 2560;
+  const size_t b_ld = 160;
+  const size_t c_ld = 261;
+  std::vector<float> alphas(batch_count);
+  std::vector<float> betas(batch_count);
+  std::vector<size_t> a_offsets(batch_count);
+  std::vector<size_t> b_offsets(batch_count);
+  std::vector<size_t> c_offsets(batch_count);
+  for (auto b_id = size_t{0}; b_id < batch_count; ++b_id) {
+    alphas[b_id] = 1.0f;
+    betas[b_id] = 1.0f;
+    a_offsets[b_id] = 0;
+    b_offsets[b_id] = 0;
+    c_offsets[b_id] = b_id;
+  }
+  const auto a_size = a_ld * m;
+  const auto b_size = b_ld * k;
+  const auto c_size = c_ld * k; // NOTE(review): C has m rows, so c_ld * m may suffice - confirm
+
+  // Initializes the OpenCL platform (the range check also covers an empty platform list)
+  auto platforms = std::vector<cl::Platform>();
+  cl::Platform::get(&platforms);
+  if (platform_id >= platforms.size()) { return 1; }
+  auto platform = platforms[platform_id];
+
+  // Initializes the OpenCL device (the range check also covers an empty device list)
+  auto devices = std::vector<cl::Device>();
+  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+  if (device_id >= devices.size()) { return 1; }
+  auto device = devices[device_id];
+
+  // Creates the OpenCL context, queue, and an event
+  auto device_as_vector = std::vector<cl::Device>{device};
+  auto context = cl::Context(device_as_vector);
+  auto queue = cl::CommandQueue(context, device);
+  auto event = cl_event{nullptr};
+
+  // Populate host matrices with some example data
+  auto host_a = std::vector<float>(a_size);
+  auto host_b = std::vector<float>(b_size);
+  auto host_c = std::vector<float>(c_size);
+  for (auto &item: host_a) { item = 12.193f; }
+  for (auto &item: host_b) { item = -8.199f; }
+  for (auto &item: host_c) { item = 0.0f; }
+
+  // Copy the matrices to the device (CL_TRUE requests blocking writes)
+  auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(float));
+  auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(float));
+  auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float));
+  queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data());
+  queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data());
+  queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data());
+
+  // Start the timer
+  const auto start_time = std::chrono::steady_clock::now();
+
+  // Calls the routine. Note that the type of alphas and betas (float) determine the precision.
+  auto queue_plain = queue(); // CLBlast takes a pointer to the plain OpenCL queue handle
+  const auto status = clblast::GemmBatched(clblast::Layout::kRowMajor,
+                                           clblast::Transpose::kNo, clblast::Transpose::kNo,
+                                           m, n, k,
+                                           alphas.data(),
+                                           device_a(), a_offsets.data(), a_ld,
+                                           device_b(), b_offsets.data(), b_ld,
+                                           betas.data(),
+                                           device_c(), c_offsets.data(), c_ld,
+                                           batch_count,
+                                           &queue_plain, &event);
+
+  // Record the execution time (the event is only waited on and released after a successful call)
+  if (status == clblast::StatusCode::kSuccess) {
+    clWaitForEvents(1, &event);
+    clReleaseEvent(event);
+  }
+  const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+  const auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
+
+  // Example completed. See "clblast.h" for status codes (0 -> success).
+  printf("Completed batched SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sgemm_cuda.cpp b/samples/sgemm_cuda.cpp
new file mode 100644
index 00000000..8e4397df
--- /dev/null
+++ b/samples/sgemm_cuda.cpp
@@ -0,0 +1,105 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the SGEMM routine with the C++ CUDA API of CLBlast.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+// Includes the CUDA driver API
+#include <cuda.h>
+
+// Includes the CLBlast library
+#include <clblast_cuda.h>
+
+// =================================================================================================
+
+// Example use of the single-precision Xgemm routine SGEMM
+int main() {
+
+  // CUDA device selection
+  const auto device_id = 0;
+
+  // Example SGEMM arguments: row-major, non-transposed A (m x k), B (k x n) and C (m x n)
+  const size_t m = 128;
+  const size_t n = 64;
+  const size_t k = 512;
+  const float alpha = 0.7f;
+  const float beta = 1.0f;
+  const auto a_ld = k;
+  const auto b_ld = n;
+  const auto c_ld = n;
+
+  // Initializes the CUDA driver API and selects the device
+  cuInit(0);
+  CUdevice device;
+  cuDeviceGet(&device, device_id);
+
+  // Creates the CUDA context and a stream for the host-to-device copies
+  CUcontext context;
+  cuCtxCreate(&context, 0, device);
+  CUstream stream;
+  cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
+
+  // Populate host matrices with some example data
+  auto host_a = std::vector<float>(m*k);
+  auto host_b = std::vector<float>(n*k);
+  auto host_c = std::vector<float>(m*n);
+  for (auto &item: host_a) { item = 12.193f; }
+  for (auto &item: host_b) { item = -8.199f; }
+  for (auto &item: host_c) { item = 0.0f; }
+
+  // Copy the matrices to the device and wait until the asynchronous uploads have completed
+  CUdeviceptr device_a = 0, device_b = 0, device_c = 0;
+  cuMemAlloc(&device_a, host_a.size()*sizeof(float));
+  cuMemAlloc(&device_b, host_b.size()*sizeof(float));
+  cuMemAlloc(&device_c, host_c.size()*sizeof(float));
+  cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream);
+  cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(float), stream);
+  cuMemcpyHtoDAsync(device_c, host_c.data(), host_c.size()*sizeof(float), stream);
+  cuStreamSynchronize(stream); // Gemm is not given `stream`, so the uploads must finish first
+
+  // Start the timer
+  const auto start_time = std::chrono::steady_clock::now();
+
+  // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision.
+  const auto status = clblast::Gemm(clblast::Layout::kRowMajor,
+                                    clblast::Transpose::kNo, clblast::Transpose::kNo,
+                                    m, n, k,
+                                    alpha,
+                                    device_a, 0, a_ld,
+                                    device_b, 0, b_ld,
+                                    beta,
+                                    device_c, 0, c_ld,
+                                    context, device);
+  cuCtxSynchronize(); // Gemm was not given `stream`, so wait for all work in the context
+
+  // Record the execution time
+  const auto elapsed_time = std::chrono::steady_clock::now() - start_time;
+  const auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count();
+
+  // Example completed. See "clblast_cuda.h" for status codes (0 -> success).
+  printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status));
+
+  // Clean-up (the context is destroyed last, after the resources created within it)
+  cuMemFree(device_a);
+  cuMemFree(device_b);
+  cuMemFree(device_c);
+  cuStreamDestroy(stream);
+  cuCtxDestroy(context);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/sgemm_netlib.c b/samples/sgemm_netlib.c
new file mode 100644
index 00000000..0c8f76e9
--- /dev/null
+++ b/samples/sgemm_netlib.c
@@ -0,0 +1,69 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not
+// recommended if you want full control over performance: it will internally copy buffers from and
+// to the OpenCL device.
+//
+// Note that this example is meant for illustration purposes only. CLBlast provides other programs
+// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
+//
+// =================================================================================================
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+// Includes the CLBlast library (Netlib CBLAS interface)
+#include <clblast_netlib_c.h>
+
+// =================================================================================================
+
+// Example use of the single-precision routine SGEMM
+int main(void) {
+
+  // Example SGEMM arguments: row-major, non-transposed A (m x k), B (k x n) and C (m x n)
+  const int m = 128;
+  const int n = 64;
+  const int k = 512;
+  const float alpha = 0.7f;
+  const float beta = 1.0f;
+  const int a_ld = k;
+  const int b_ld = n;
+  const int c_ld = n;
+
+  // Populate host matrices with some example data (exits with status 1 if an allocation failed)
+  float* host_a = (float*)malloc(sizeof(float)*m*k);
+  float* host_b = (float*)malloc(sizeof(float)*n*k);
+  float* host_c = (float*)malloc(sizeof(float)*m*n);
+  if (!host_a || !host_b || !host_c) { free(host_a); free(host_b); free(host_c); return 1; }
+  for (int i=0; i<m*k; ++i) { host_a[i] = 12.193f; }
+  for (int i=0; i<n*k; ++i) { host_b[i] = -8.199f; }
+  for (int i=0; i<m*n; ++i) { host_c[i] = 0.0f; }
+
+  // Call the SGEMM routine directly on the host pointers.
+  cblas_sgemm(CLBlastLayoutRowMajor,
+              CLBlastTransposeNo, CLBlastTransposeNo,
+              m, n, k,
+              alpha,
+              host_a, a_ld,
+              host_b, b_ld,
+              beta,
+              host_c, c_ld);
+
+  printf("Completed SGEMM\n");
+
+  // Clean-up
+  free(host_a);
+  free(host_b);
+  free(host_c);
+  return 0;
+}
+
+// =================================================================================================
diff --git a/samples/tuning_api.cpp b/samples/tuning_api.cpp
new file mode 100644
index 00000000..f92b6909
--- /dev/null
+++ b/samples/tuning_api.cpp
@@ -0,0 +1,77 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+//   Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file demonstrates the use of the runtime tuning API. It is a stand-alone example, but it
+// does require the Khronos C++ OpenCL API header file (downloaded by CMake).
+//
+// =================================================================================================
+
+#include <cstdio>
+#include <chrono>
+#include <vector>
+
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings
+
+// Includes the C++ OpenCL API. If not yet available, it can be found here:
+// https://www.khronos.org/registry/cl/api/1.1/cl.hpp
+#include "cl.hpp"
+
+// Includes the CLBlast library
+#include <clblast.h>
+
+// =================================================================================================
+
+int main() {
+
+  // OpenCL platform/device settings (size_t, as they index the std::vector lists below)
+  const size_t platform_id = 0;
+  const size_t device_id = 0;
+
+  // Example arguments
+  const size_t m = 128;
+  const size_t n = 64;
+  const auto fraction = 1.0; // between 0.0 and 1.0
+
+  // Initializes the OpenCL platform (the range check also covers an empty platform list)
+  auto platforms = std::vector<cl::Platform>();
+  cl::Platform::get(&platforms);
+  if (platform_id >= platforms.size()) { return 1; }
+  auto platform = platforms[platform_id];
+
+  // Initializes the OpenCL device (the range check also covers an empty device list)
+  auto devices = std::vector<cl::Device>();
+  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
+  if (device_id >= devices.size()) { return 1; }
+  auto device = devices[device_id];
+
+  // Creates the OpenCL context and queue
+  auto device_as_vector = std::vector<cl::Device>{device};
+  auto context = cl::Context(device_as_vector);
+  auto queue = cl::CommandQueue(context, device);
+
+  // Performs the tuning; the parameters that were found are returned through 'parameters'
+  printf("Starting the tuning...\n");
+  std::unordered_map<std::string,size_t> parameters;
+  auto queue_plain = queue(); // CLBlast takes a pointer to the plain OpenCL queue handle
+  auto status = clblast::TuneCopy<float>(&queue_plain, m, n, fraction, parameters);
+
+  // Tuning completed. See "clblast.h" for status codes (0 -> success).
+  printf("Completed TuneCopy with status %d (0 == OK), found parameters:\n", static_cast<int>(status));
+  for (const auto &parameter: parameters) { // by reference: avoids copying each name/value pair
+    printf(">  %s = %zu\n", parameter.first.c_str(), parameter.second);
+  }
+
+  // Set the new parameters
+  status = clblast::OverrideParameters(device(), "Copy", clblast::Precision::kSingle, parameters);
+  printf("Completed OverrideParameters with status %d (0 == OK)\n", static_cast<int>(status));
+  return 0;
+}
+
+// =================================================================================================
author	Gard Spreemann <gspr@nonempty.org>	2020-12-22 15:39:15 +0100
committer	Gard Spreemann <gspr@nonempty.org>	2020-12-22 15:39:15 +0100
commit	7b1d3e5f0a1a36a469905e0b73d48cfea4d1bd46 (patch)
tree	e211fcdf8cee8d5841ef0dd7b41a89f542444ff7 /samples
parent	6408c2fc41fa1b04d6abf470bafb9961a28c90cd (diff)
parent	8433985051c0fb9758fd8dfe7d19cc8eaca630e1 (diff)