diff options
author | Gard Spreemann <gspr@nonempty.org> | 2020-12-22 15:39:15 +0100 |
---|---|---|
committer | Gard Spreemann <gspr@nonempty.org> | 2020-12-22 15:39:15 +0100 |
commit | 7b1d3e5f0a1a36a469905e0b73d48cfea4d1bd46 (patch) | |
tree | e211fcdf8cee8d5841ef0dd7b41a89f542444ff7 /samples | |
parent | 6408c2fc41fa1b04d6abf470bafb9961a28c90cd (diff) | |
parent | 8433985051c0fb9758fd8dfe7d19cc8eaca630e1 (diff) |
Merge tag '1.5.1' into debian/sid
Diffstat (limited to 'samples')
-rw-r--r-- | samples/cache.c | 138 | ||||
-rw-r--r-- | samples/daxpy_cuda.cpp | 88 | ||||
-rw-r--r-- | samples/dgemv.c | 111 | ||||
-rw-r--r-- | samples/dtrsm.cpp | 117 | ||||
-rw-r--r-- | samples/haxpy.c | 110 | ||||
-rw-r--r-- | samples/sasum.c | 101 | ||||
-rw-r--r-- | samples/sgemm.c | 115 | ||||
-rw-r--r-- | samples/sgemm.cpp | 114 | ||||
-rw-r--r-- | samples/sgemm_batched.cpp | 129 | ||||
-rw-r--r-- | samples/sgemm_cuda.cpp | 105 | ||||
-rw-r--r-- | samples/sgemm_netlib.c | 69 | ||||
-rw-r--r-- | samples/tuning_api.cpp | 77 |
12 files changed, 1274 insertions, 0 deletions
diff --git a/samples/cache.c b/samples/cache.c new file mode 100644 index 00000000..980c7cf3 --- /dev/null +++ b/samples/cache.c @@ -0,0 +1,138 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the CLBlast kernel cache, which stores compiled OpenCL binaries for faster +// repeated kernel execution. The cache can be pre-initialized or cleared. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <time.h> + +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the CLBlast library (C interface) +#include <clblast_c.h> + +// Forward declaration +void run_example_routine(const cl_device_id device); + +// ================================================================================================= + +// Example use of the CLBlast kernel cache +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Run the routine multiple times in a row: after the first time the binary is already in the + // cache and compilation is no longer needed. + printf("Starting caching sample with an empty cache\n"); + run_example_routine(device); + run_example_routine(device); + run_example_routine(device); + + // Clearing the cache makes CLBlast re-compile the kernel once + printf("Clearing cache\n"); + CLBlastClearCache(); + run_example_routine(device); + run_example_routine(device); + + // When the cache is empty, it can be pre-initialized with compiled kernels for all routines by + // calling the CLBlastFillCache function, such that all other CLBlast calls can benefit from + // pre-compiled kernels and thus execute at maximum speed. + printf("Clearing cache\n"); + CLBlastClearCache(); + printf("Filling cache (this might take a while)\n"); + CLBlastFillCache(device); + run_example_routine(device); + + // Clean-up + free(platforms); + free(devices); + return 0; +} + +// ================================================================================================= + +// Runs an example routine and reports the time +void run_example_routine(const cl_device_id device) { + + // Example SASUM arguments + const size_t n = 1024*128; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host data structures with some example data + float* host_input = (float*)malloc(sizeof(float)*n); + float* host_output = (float*)malloc(sizeof(float)*1); + for (size_t i=0; i<n; ++i) { host_input[i] = -1.5f; } + for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; } + + // Copy the data-structures to the device + cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL); + cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL); + clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL); + + // Start the timer + clock_t start = clock(); + + // Calls an example routine + CLBlastStatusCode status = CLBlastSasum(n, + device_output, 0, + device_input, 0, 1, + &queue, &event); + + // Wait for completion + if (status == CLBlastSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + + // Retrieves the execution time + clock_t diff = clock() - start; + double time_ms = diff * 1000.0f / (double)CLOCKS_PER_SEC; + + // Routine completed. See "clblast_c.h" for status codes (0 -> success). + printf("Completed routine with status %d in %.3lf ms\n", status, time_ms); + + // Clean-up + free(host_input); + free(host_output); + clReleaseMemObject(device_input); + clReleaseMemObject(device_output); + clReleaseCommandQueue(queue); + clReleaseContext(context); +} + +// ================================================================================================= diff --git a/samples/daxpy_cuda.cpp b/samples/daxpy_cuda.cpp new file mode 100644 index 00000000..cead3f6d --- /dev/null +++ b/samples/daxpy_cuda.cpp @@ -0,0 +1,88 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the DAXPY routine with the C++ CUDA API of CLBlast. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <cstdio> +#include <chrono> +#include <vector> + +// Includes the CUDA driver API +#include <cuda.h> + +// Includes the CLBlast library +#include <clblast_cuda.h> + +// ================================================================================================= + +// Example use of the double-precision Xaxpy routine DAXPY +int main() { + + // CUDA device selection + const auto device_id = 0; + + // Example DAXPY arguments + const size_t n = 8192; + const double alpha = 0.7; + + // Initializes the OpenCL device + cuInit(0); + CUdevice device; + cuDeviceGet(&device, device_id); + + // Creates the OpenCL context and stream + CUcontext context; + cuCtxCreate(&context, 0, device); + CUstream stream; + cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING); + + // Populate host matrices with some example data + auto host_a = std::vector<double>(n); + auto host_b = std::vector<double>(n); + for (auto &item: host_a) { item = 12.193; } + for (auto &item: host_b) { item = -8.199; } + + // Copy the matrices to the device + CUdeviceptr device_a; + CUdeviceptr device_b; + cuMemAlloc(&device_a, host_a.size()*sizeof(double)); + cuMemAlloc(&device_b, host_b.size()*sizeof(double)); + cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(double), stream); + cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(double), stream); + + // Start the timer + auto start_time = std::chrono::steady_clock::now(); + + // Call the DAXPY routine. Note that the type of alpha (double) determines the precision. + const auto status = clblast::Axpy(n, alpha, + device_a, 0, 1, + device_b, 0, 1, + context, device); + cuStreamSynchronize(stream); + + // Record the execution time + auto elapsed_time = std::chrono::steady_clock::now() - start_time; + auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count(); + + // Example completed. See "clblast_cuda.h" for status codes (0 -> success). + printf("Completed DAXPY in %.3lf ms with status %d\n", time_ms, static_cast<int>(status)); + + // Clean-up + cuMemFree(device_a); + cuMemFree(device_b); + cuStreamDestroy(stream); + return 0; +} + +// ================================================================================================= diff --git a/samples/dgemv.c b/samples/dgemv.c new file mode 100644 index 00000000..975cb7ac --- /dev/null +++ b/samples/dgemv.c @@ -0,0 +1,111 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the DGEMV routine. It is pure C99 and demonstrates the use of +// the C API to the CLBlast library. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the CLBlast library (C interface) +#include <clblast_c.h> + +// ================================================================================================= + +// Example use of the double-precision routine DGEMV +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Example DGEMV arguments + const size_t m = 128; + const size_t n = 289; + const double alpha = 0.7; + const double beta = 0.0; + const size_t a_ld = n; + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host data structures with some example data + double* host_a = (double*)malloc(sizeof(double)*m*n); + double* host_x = (double*)malloc(sizeof(double)*n); + double* host_y = (double*)malloc(sizeof(double)*m); + for (size_t i=0; i<m*n; ++i) { host_a[i] = 12.193; } + for (size_t i=0; i<n; ++i) { host_x[i] = -8.199; } + for (size_t i=0; i<m; ++i) { host_y[i] = 0.0; } + + // Copy the data-structures to the device + cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(double), NULL, NULL); + cl_mem device_x = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(double), NULL, NULL); + cl_mem device_y = clCreateBuffer(context, CL_MEM_READ_WRITE, m*sizeof(double), NULL, NULL); + clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, m*n*sizeof(double), host_a, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, device_x, CL_TRUE, 0, n*sizeof(double), host_x, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, device_y, CL_TRUE, 0, m*sizeof(double), host_y, 0, NULL, NULL); + + // Call the DGEMV routine. + CLBlastStatusCode status = CLBlastDgemv(CLBlastLayoutRowMajor, CLBlastTransposeNo, + m, n, + alpha, + device_a, 0, a_ld, + device_x, 0, 1, + beta, + device_y, 0, 1, + &queue, &event); + + // Wait for completion + if (status == CLBlastSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + + // Example completed. See "clblast_c.h" for status codes (0 -> success). + printf("Completed DGEMV with status %d\n", status); + + // Clean-up + free(platforms); + free(devices); + free(host_a); + free(host_x); + free(host_y); + clReleaseMemObject(device_a); + clReleaseMemObject(device_x); + clReleaseMemObject(device_y); + clReleaseCommandQueue(queue); + clReleaseContext(context); + return 0; +} + +// ================================================================================================= diff --git a/samples/dtrsm.cpp b/samples/dtrsm.cpp new file mode 100644 index 00000000..6d37dad7 --- /dev/null +++ b/samples/dtrsm.cpp @@ -0,0 +1,117 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the DTRSM routine. It is a stand-alone example, but it does +// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++ +// features, but CLBlast can also be used using the regular C-style OpenCL API. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <cstdio> +#include <vector> + +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the C++ OpenCL API. If not yet available, it can be found here: +// https://www.khronos.org/registry/cl/api/1.1/cl.hpp +#include "cl.hpp" + +// Includes the CLBlast library +#include <clblast.h> + +// ================================================================================================= + +// Example use of the double-precision Xtrsm routine DTRSM, solving A*X = alpha*B, storing the +// result in the memory of matrix B. Uses row-major storage (C-style). +int main() { + + // OpenCL platform/device settings + const auto platform_id = 0; + const auto device_id = 0; + + // Example TRSM arguments + const size_t m = 4; + const size_t n = 3; + const double alpha = 1.0; + const auto a_ld = m; + const auto b_ld = n; + + // Initializes the OpenCL platform + auto platforms = std::vector<cl::Platform>(); + cl::Platform::get(&platforms); + if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } + auto platform = platforms[platform_id]; + + // Initializes the OpenCL device + auto devices = std::vector<cl::Device>(); + platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); + if (devices.size() == 0 || device_id >= devices.size()) { return 1; } + auto device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + auto device_as_vector = std::vector<cl::Device>{device}; + auto context = cl::Context(device_as_vector); + auto queue = cl::CommandQueue(context, device); + auto event = cl_event{nullptr}; + + // Populate host matrices with some example data + auto host_a = std::vector<double>({1.0, 2.0, 1.0, -2.0, + 0.0, -1.0, -2.0, 0.0, + 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 0.0, -1.0}); + auto host_b = std::vector<double>({-1.0, -1.0, 3.0, + 1.0, -3.0, 2.0, + 1.0, 1.0, -1.0, + 4.0, -1.0, -2.0}); + // Expected result: + // 8 -5 2 + // -11 3 4 + // 5 0 -3 + // -4 1 2 + + // Copy the matrices to the device + auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(double)); + auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(double)); + queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(double), host_a.data()); + queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data()); + + // Call the DTRSM routine. Note that the type of alpha and beta (double) determine the precision. + auto queue_plain = queue(); + auto status = clblast::Trsm(clblast::Layout::kRowMajor, clblast::Side::kLeft, + clblast::Triangle::kUpper, clblast::Transpose::kNo, + clblast::Diagonal::kNonUnit, + m, n, + alpha, + device_a(), 0, a_ld, + device_b(), 0, b_ld, + &queue_plain, &event); + + // Retrieves the results + if (status == clblast::StatusCode::kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + queue.enqueueReadBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data()); + + // Example completed. See "clblast.h" for status codes (0 -> success). + printf("Completed TRSM with status %d and results:\n", static_cast<int>(status)); + for (auto i = size_t{0}; i < m; ++i) { + for (auto j = size_t{0}; j < n; ++j) { + printf("%3.0f ", host_b[i * b_ld + j]); + } + printf("\n"); + } + return 0; +} + +// ================================================================================================= diff --git a/samples/haxpy.c b/samples/haxpy.c new file mode 100644 index 00000000..4f2bb400 --- /dev/null +++ b/samples/haxpy.c @@ -0,0 +1,110 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the HAXPY routine. It demonstrates the use of half-precision. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the CLBlast library (C interface) +#include <clblast_c.h> + +// Includes the float-to-half and half-to-float conversion utilities +#include <clblast_half.h> + +// ================================================================================================= + +// Example use of the half-precision routine HAXPY +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Example HAXPY arguments + const size_t n = 8192; + const cl_half alpha = FloatToHalf(0.5f); + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host vectors with some example data + cl_half* host_a = (cl_half*)malloc(sizeof(cl_half)*n); + cl_half* host_b = (cl_half*)malloc(sizeof(cl_half)*n); + for (size_t i=0; i<n; ++i) { host_a[i] = FloatToHalf(2.2f); } + for (size_t i=0; i<n; ++i) { host_b[i] = FloatToHalf(0.4f); } + printf("Input values at index 0: alpha * a[0] + b[0] == %.3lf * %.3lf + %.3lf\n", + HalfToFloat(alpha), HalfToFloat(host_a[0]), HalfToFloat(host_b[0])); + + // Copy the matrices to the device + cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL); + cl_mem device_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(cl_half), NULL, NULL); + clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, n*sizeof(cl_half), host_a, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL); + + // Call the HAXPY routine. + CLBlastStatusCode status = CLBlastHaxpy(n, alpha, + device_a, 0, 1, + device_b, 0, 1, + &queue, &event); + + // Wait for completion + if (status == CLBlastSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + + // Copies the result back to the host + clEnqueueReadBuffer(queue, device_b, CL_TRUE, 0, n*sizeof(cl_half), host_b, 0, NULL, NULL); + + // Example completed. See "clblast_c.h" for status codes (0 -> success). + printf("Completed HAXPY with status %d\n", status); + + // Prints the first output value + if (status == 0) { + printf("Output value at index 0: b[0] = %.3lf\n", HalfToFloat(host_b[0])); + } + + // Clean-up + free(platforms); + free(devices); + free(host_a); + free(host_b); + clReleaseMemObject(device_a); + clReleaseMemObject(device_b); + clReleaseCommandQueue(queue); + clReleaseContext(context); + return 0; +} + +// ================================================================================================= diff --git a/samples/sasum.c b/samples/sasum.c new file mode 100644 index 00000000..78377336 --- /dev/null +++ b/samples/sasum.c @@ -0,0 +1,101 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the SASUM routine. It is pure C99 and demonstrates the use of +// the C API to the CLBlast library. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the CLBlast library (C interface) +#include <clblast_c.h> + +// ================================================================================================= + +// Example use of the single-precision routine SASUM +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Example SASUM arguments + const size_t n = 1000; + const float input_value = -1.5f; + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host data structures with some example data + float* host_input = (float*)malloc(sizeof(float)*n); + float* host_output = (float*)malloc(sizeof(float)*1); + for (size_t i=0; i<n; ++i) { host_input[i] = input_value; } + for (size_t i=0; i<1; ++i) { host_output[i] = 0.0f; } + + // Copy the data-structures to the device + cl_mem device_input = clCreateBuffer(context, CL_MEM_READ_WRITE, n*sizeof(float), NULL, NULL); + cl_mem device_output = clCreateBuffer(context, CL_MEM_READ_WRITE, 1*sizeof(float), NULL, NULL); + clEnqueueWriteBuffer(queue, device_input, CL_TRUE, 0, n*sizeof(float), host_input, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL); + + // Call the SASUM routine. + CLBlastStatusCode status = CLBlastSasum(n, + device_output, 0, + device_input, 0, 1, + &queue, &event); + + // Wait for completion + if (status == CLBlastSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + + // Copies the result back to the host + clEnqueueReadBuffer(queue, device_output, CL_TRUE, 0, 1*sizeof(float), host_output, 0, NULL, NULL); + + // Example completed. See "clblast_c.h" for status codes (0 -> success). + printf("Completed SASUM with status %d: %zu * |%.1lf| = %.1lf\n", status, n, input_value, host_output[0]); + + // Clean-up + free(platforms); + free(devices); + free(host_input); + free(host_output); + clReleaseMemObject(device_input); + clReleaseMemObject(device_output); + clReleaseCommandQueue(queue); + clReleaseContext(context); + return 0; +} + +// ================================================================================================= diff --git a/samples/sgemm.c b/samples/sgemm.c new file mode 100644 index 00000000..92f3057d --- /dev/null +++ b/samples/sgemm.c @@ -0,0 +1,115 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the SGEMM routine. It is pure C99 and demonstrates the use of +// the C API to the CLBlast library. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the CLBlast library (C interface) +#include <clblast_c.h> + +// ================================================================================================= + +// Example use of the single-precision routine SGEMM +int main(void) { + + // OpenCL platform/device settings + const size_t platform_id = 0; + const size_t device_id = 0; + + // Example SGEMM arguments + const size_t m = 128; + const size_t n = 64; + const size_t k = 512; + const float alpha = 0.7f; + const float beta = 1.0f; + const size_t a_ld = k; + const size_t b_ld = n; + const size_t c_ld = n; + + // Initializes the OpenCL platform + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, platforms, NULL); + cl_platform_id platform = platforms[platform_id]; + + // Initializes the OpenCL device + cl_uint num_devices; + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); + cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id)); + clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); + cl_device_id device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); + cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); + cl_event event = NULL; + + // Populate host matrices with some example data + float* host_a = (float*)malloc(sizeof(float)*m*k); + float* host_b = (float*)malloc(sizeof(float)*n*k); + float* host_c = (float*)malloc(sizeof(float)*m*n); + for (size_t i=0; i<m*k; ++i) { host_a[i] = 12.193f; } + for (size_t i=0; i<n*k; ++i) { host_b[i] = -8.199f; } + for (size_t i=0; i<m*n; ++i) { host_c[i] = 0.0f; } + + // Copy the matrices to the device + cl_mem device_a = clCreateBuffer(context, CL_MEM_READ_WRITE, m*k*sizeof(float), NULL, NULL); + cl_mem device_b = clCreateBuffer(context, CL_MEM_READ_WRITE, n*k*sizeof(float), NULL, NULL); + cl_mem device_c = clCreateBuffer(context, CL_MEM_READ_WRITE, m*n*sizeof(float), NULL, NULL); + clEnqueueWriteBuffer(queue, device_a, CL_TRUE, 0, m*k*sizeof(float), host_a, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, device_b, CL_TRUE, 0, n*k*sizeof(float), host_b, 0, NULL, NULL); + clEnqueueWriteBuffer(queue, device_c, CL_TRUE, 0, m*n*sizeof(float), host_c, 0, NULL, NULL); + + // Call the SGEMM routine. + CLBlastStatusCode status = CLBlastSgemm(CLBlastLayoutRowMajor, + CLBlastTransposeNo, CLBlastTransposeNo, + m, n, k, + alpha, + device_a, 0, a_ld, + device_b, 0, b_ld, + beta, + device_c, 0, c_ld, + &queue, &event); + + // Wait for completion + if (status == CLBlastSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + + // Example completed. See "clblast_c.h" for status codes (0 -> success). + printf("Completed SGEMM with status %d\n", status); + + // Clean-up + free(platforms); + free(devices); + free(host_a); + free(host_b); + free(host_c); + clReleaseMemObject(device_a); + clReleaseMemObject(device_b); + clReleaseMemObject(device_c); + clReleaseCommandQueue(queue); + clReleaseContext(context); + return 0; +} + +// ================================================================================================= diff --git a/samples/sgemm.cpp b/samples/sgemm.cpp new file mode 100644 index 00000000..ab7858e1 --- /dev/null +++ b/samples/sgemm.cpp @@ -0,0 +1,114 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the SGEMM routine. It is a stand-alone example, but it does +// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++ +// features, but CLBlast can also be used using the regular C-style OpenCL API. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <cstdio> +#include <chrono> +#include <vector> + +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the C++ OpenCL API. If not yet available, it can be found here: +// https://www.khronos.org/registry/cl/api/1.1/cl.hpp +#include "cl.hpp" + +// Includes the CLBlast library +#include <clblast.h> + +// ================================================================================================= + +// Example use of the single-precision Xgemm routine SGEMM +int main() { + + // OpenCL platform/device settings + const auto platform_id = 0; + const auto device_id = 0; + + // Example SGEMM arguments + const size_t m = 128; + const size_t n = 64; + const size_t k = 512; + const float alpha = 0.7f; + const float beta = 1.0f; + const auto a_ld = k; + const auto b_ld = n; + const auto c_ld = n; + + // Initializes the OpenCL platform + auto platforms = std::vector<cl::Platform>(); + cl::Platform::get(&platforms); + if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } + auto platform = platforms[platform_id]; + + // Initializes the OpenCL device + auto devices = std::vector<cl::Device>(); + platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); + if (devices.size() == 0 || device_id >= devices.size()) { return 1; } + auto device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + auto device_as_vector = std::vector<cl::Device>{device}; + auto context = cl::Context(device_as_vector); + auto queue = cl::CommandQueue(context, device); + auto event = cl_event{nullptr}; + + // Populate host matrices with some example data + auto host_a = std::vector<float>(m*k); + auto host_b = std::vector<float>(n*k); + auto host_c = std::vector<float>(m*n); + for (auto &item: host_a) { item = 12.193f; } + for (auto &item: host_b) { item = -8.199f; } + for (auto &item: host_c) { item = 0.0f; } + + // Copy the matrices to the device + auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(float)); + auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(float)); + auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float)); + queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data()); + queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data()); + queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data()); + + // Start the timer + auto start_time = std::chrono::steady_clock::now(); + + // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. + auto queue_plain = queue(); + auto status = clblast::Gemm(clblast::Layout::kRowMajor, + clblast::Transpose::kNo, clblast::Transpose::kNo, + m, n, k, + alpha, + device_a(), 0, a_ld, + device_b(), 0, b_ld, + beta, + device_c(), 0, c_ld, + &queue_plain, &event); + + // Record the execution time + if (status == clblast::StatusCode::kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + auto elapsed_time = std::chrono::steady_clock::now() - start_time; + auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count(); + + // Example completed. See "clblast.h" for status codes (0 -> success). + printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status)); + return 0; +} + +// ================================================================================================= diff --git a/samples/sgemm_batched.cpp b/samples/sgemm_batched.cpp new file mode 100644 index 00000000..32c465c7 --- /dev/null +++ b/samples/sgemm_batched.cpp @@ -0,0 +1,129 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the batched SGEMM routine. It is a stand-alone example, but it +// does require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++ +// features, but CLBlast can also be used using the regular C-style OpenCL API. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <cstdio> +#include <chrono> +#include <vector> + +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the C++ OpenCL API. If not yet available, it can be found here: +// https://www.khronos.org/registry/cl/api/1.1/cl.hpp +#include "cl.hpp" + +// Includes the CLBlast library +#include <clblast.h> + +// ================================================================================================= + +// Example use of the single-precision batched SGEMM routine +int main() { + + // OpenCL platform/device settings + const auto platform_id = 0; + const auto device_id = 0; + + // Example arguments + const size_t batch_count = 261; + const size_t m = 1; + const size_t n = 1; + const size_t k = 40; + const auto a_ld = 2560; + const auto b_ld = 160; + const auto c_ld = 261; + std::vector<float> alphas(batch_count); + std::vector<float> betas(batch_count); + std::vector<size_t> a_offsets(batch_count); + std::vector<size_t> b_offsets(batch_count); + std::vector<size_t> c_offsets(batch_count); + for (auto b_id = size_t{0}; b_id < batch_count; ++b_id) { + alphas[b_id] = 1.0f; + betas[b_id] = 1.0f; + a_offsets[b_id] = 0; + b_offsets[b_id] = 0; + c_offsets[b_id] = b_id; + } + const auto a_size = a_ld * m; + const auto b_size = b_ld * k; + const auto c_size = c_ld * k; + + // Initializes the OpenCL platform + auto platforms = std::vector<cl::Platform>(); + cl::Platform::get(&platforms); + if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } + auto platform = platforms[platform_id]; + + // Initializes the OpenCL device + auto devices = std::vector<cl::Device>(); + platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); + if (devices.size() == 0 || device_id >= devices.size()) { return 1; } + auto device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + auto device_as_vector = std::vector<cl::Device>{device}; + auto context = cl::Context(device_as_vector); + auto queue = cl::CommandQueue(context, device); + auto event = cl_event{nullptr}; + + // Populate host matrices with some example data + auto host_a = std::vector<float>(a_size); + auto host_b = std::vector<float>(b_size); + auto host_c = std::vector<float>(c_size); + for (auto &item: host_a) { item = 12.193f; } + for (auto &item: host_b) { item = -8.199f; } + for (auto &item: host_c) { item = 0.0f; } + + // Copy the matrices to the device + auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(float)); + auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(float)); + auto device_c = cl::Buffer(context, CL_MEM_READ_WRITE, host_c.size()*sizeof(float)); + queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(float), host_a.data()); + queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(float), host_b.data()); + queue.enqueueWriteBuffer(device_c, CL_TRUE, 0, host_c.size()*sizeof(float), host_c.data()); + + // Start the timer + auto start_time = std::chrono::steady_clock::now(); + + // Calls the routine. Note that the type of alphas and betas (float) determine the precision. + auto queue_plain = queue(); + auto status = clblast::GemmBatched(clblast::Layout::kRowMajor, + clblast::Transpose::kNo, clblast::Transpose::kNo, + m, n, k, + alphas.data(), + device_a(), a_offsets.data(), a_ld, + device_b(), b_offsets.data(), b_ld, + betas.data(), + device_c(), c_offsets.data(), c_ld, + batch_count, + &queue_plain, &event); + + // Record the execution time + if (status == clblast::StatusCode::kSuccess) { + clWaitForEvents(1, &event); + clReleaseEvent(event); + } + auto elapsed_time = std::chrono::steady_clock::now() - start_time; + auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count(); + + // Example completed. See "clblast.h" for status codes (0 -> success). + printf("Completed batched SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status)); + return 0; +} + +// ================================================================================================= diff --git a/samples/sgemm_cuda.cpp b/samples/sgemm_cuda.cpp new file mode 100644 index 00000000..8e4397df --- /dev/null +++ b/samples/sgemm_cuda.cpp @@ -0,0 +1,105 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the SGEMM routine with the C++ CUDA API of CLBlast. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <cstdio> +#include <chrono> +#include <vector> + +// Includes the CUDA driver API +#include <cuda.h> + +// Includes the CLBlast library +#include <clblast_cuda.h> + +// ================================================================================================= + +// Example use of the single-precision Xgemm routine SGEMM +int main() { + + // CUDA device selection + const auto device_id = 0; + + // Example SGEMM arguments + const size_t m = 128; + const size_t n = 64; + const size_t k = 512; + const float alpha = 0.7f; + const float beta = 1.0f; + const auto a_ld = k; + const auto b_ld = n; + const auto c_ld = n; + + // Initializes the OpenCL device + cuInit(0); + CUdevice device; + cuDeviceGet(&device, device_id); + + // Creates the OpenCL context and stream + CUcontext context; + cuCtxCreate(&context, 0, device); + CUstream stream; + cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING); + + // Populate host matrices with some example data + auto host_a = std::vector<float>(m*k); + auto host_b = std::vector<float>(n*k); + auto host_c = std::vector<float>(m*n); + for (auto &item: host_a) { item = 12.193f; } + for (auto &item: host_b) { item = -8.199f; } + for (auto &item: host_c) { item = 0.0f; } + + // Copy the matrices to the device + CUdeviceptr device_a; + CUdeviceptr device_b; + CUdeviceptr device_c; + cuMemAlloc(&device_a, host_a.size()*sizeof(float)); + cuMemAlloc(&device_b, host_b.size()*sizeof(float)); + cuMemAlloc(&device_c, host_c.size()*sizeof(float)); + cuMemcpyHtoDAsync(device_a, host_a.data(), host_a.size()*sizeof(float), stream); + cuMemcpyHtoDAsync(device_b, host_b.data(), host_b.size()*sizeof(float), stream); + cuMemcpyHtoDAsync(device_c, host_c.data(), host_c.size()*sizeof(float), stream); + + // Start the timer + auto start_time = std::chrono::steady_clock::now(); + + // Call the SGEMM routine. Note that the type of alpha and beta (float) determine the precision. + auto status = clblast::Gemm(clblast::Layout::kRowMajor, + clblast::Transpose::kNo, clblast::Transpose::kNo, + m, n, k, + alpha, + device_a, 0, a_ld, + device_b, 0, b_ld, + beta, + device_c, 0, c_ld, + context, device); + cuStreamSynchronize(stream); + + // Record the execution time + auto elapsed_time = std::chrono::steady_clock::now() - start_time; + auto time_ms = std::chrono::duration<double,std::milli>(elapsed_time).count(); + + // Example completed. See "clblast_cuda.h" for status codes (0 -> success). + printf("Completed SGEMM in %.3lf ms with status %d\n", time_ms, static_cast<int>(status)); + + // Clean-up + cuMemFree(device_a); + cuMemFree(device_b); + cuMemFree(device_c); + cuStreamDestroy(stream); + return 0; +} + +// ================================================================================================= diff --git a/samples/sgemm_netlib.c b/samples/sgemm_netlib.c new file mode 100644 index 00000000..0c8f76e9 --- /dev/null +++ b/samples/sgemm_netlib.c @@ -0,0 +1,69 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the Netlib CBLAS API of the CLBlast library. This API is not +// recommended if you want full control over performance: it will internally copy buffers from and +// to the OpenCL device. +// +// Note that this example is meant for illustration purposes only. CLBlast provides other programs +// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx'). +// +// ================================================================================================= + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +// Includes the CLBlast library (Netlib CBLAS interface) +#include <clblast_netlib_c.h> + +// ================================================================================================= + +// Example use of the single-precision routine SGEMM +int main(void) { + + // Example SGEMM arguments + const int m = 128; + const int n = 64; + const int k = 512; + const float alpha = 0.7f; + const float beta = 1.0f; + const int a_ld = k; + const int b_ld = n; + const int c_ld = n; + + // Populate host matrices with some example data + float* host_a = (float*)malloc(sizeof(float)*m*k); + float* host_b = (float*)malloc(sizeof(float)*n*k); + float* host_c = (float*)malloc(sizeof(float)*m*n); + for (int i=0; i<m*k; ++i) { host_a[i] = 12.193f; } + for (int i=0; i<n*k; ++i) { host_b[i] = -8.199f; } + for (int i=0; i<m*n; ++i) { host_c[i] = 0.0f; } + + // Call the SGEMM routine. + cblas_sgemm(CLBlastLayoutRowMajor, + CLBlastTransposeNo, CLBlastTransposeNo, + m, n, k, + alpha, + host_a, a_ld, + host_b, b_ld, + beta, + host_c, c_ld); + + // Example completed + printf("Completed SGEMM\n"); + + // Clean-up + free(host_a); + free(host_b); + free(host_c); + return 0; +} + +// ================================================================================================= diff --git a/samples/tuning_api.cpp b/samples/tuning_api.cpp new file mode 100644 index 00000000..f92b6909 --- /dev/null +++ b/samples/tuning_api.cpp @@ -0,0 +1,77 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file demonstrates the use of the runtime tuning API. It is a stand-alone example, but it +// does require the Khronos C++ OpenCL API header file (downloaded by CMake). +// +// ================================================================================================= + +#include <cstdio> +#include <chrono> +#include <vector> + +#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings + +// Includes the C++ OpenCL API. If not yet available, it can be found here: +// https://www.khronos.org/registry/cl/api/1.1/cl.hpp +#include "cl.hpp" + +// Includes the CLBlast library +#include <clblast.h> + +// ================================================================================================= + +int main() { + + // OpenCL platform/device settings + const auto platform_id = 0; + const auto device_id = 0; + + // Example arguments + const size_t m = 128; + const size_t n = 64; + const auto fraction = 1.0; // between 0.0 and 1.0 + + // Initializes the OpenCL platform + auto platforms = std::vector<cl::Platform>(); + cl::Platform::get(&platforms); + if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } + auto platform = platforms[platform_id]; + + // Initializes the OpenCL device + auto devices = std::vector<cl::Device>(); + platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); + if (devices.size() == 0 || device_id >= devices.size()) { return 1; } + auto device = devices[device_id]; + + // Creates the OpenCL context, queue, and an event + auto device_as_vector = std::vector<cl::Device>{device}; + auto context = cl::Context(device_as_vector); + auto queue = cl::CommandQueue(context, device); + + // Performs the tuning + printf("Starting the tuning...\n"); + std::unordered_map<std::string,size_t> parameters; + auto queue_plain = queue(); + auto status = clblast::TuneCopy<float>(&queue_plain, m, n, fraction, parameters); + + // Tuning completed. See "clblast.h" for status codes (0 -> success). + printf("Completed TuneCopy with status %d (0 == OK), found parameters:\n", static_cast<int>(status)); + for (const auto parameter: parameters) { + printf("> %s = %zu\n", parameter.first.c_str(), parameter.second); + } + + // Set the new parameters + status = clblast::OverrideParameters(device(), "Copy", clblast::Precision::kSingle, parameters); + printf("Completed OverrideParameters with status %d (0 == OK)\n", static_cast<int>(status)); + return 0; +} + +// ================================================================================================= |