// path: root/samples/dtrsm.cpp
// blob: e207e5d3a35a691e483ebee13901e09ffc6a244f (plain)
// =================================================================================================
// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
// width of 100 characters per line.
//
// Author(s):
//   Cedric Nugteren <www.cedricnugteren.nl>
//
// This file demonstrates the use of the DTRSM routine. It is a stand-alone example, but it does
// require the Khronos C++ OpenCL API header file (downloaded by CMake). The example uses C++
// features, but CLBlast can also be used using the regular C-style OpenCL API.
//
// Note that this example is meant for illustration purposes only. CLBlast provides other programs
// for performance benchmarking ('client_xxxxx') and for correctness testing ('test_xxxxx').
//
// =================================================================================================

#include <cstdio>
#include <vector>

#define CL_USE_DEPRECATED_OPENCL_1_1_APIS // to disable deprecation warnings
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // to disable deprecation warnings

// Includes the C++ OpenCL API. If not yet available, it can be found here:
// https://raw.githubusercontent.com/KhronosGroup/OpenCL-CLHPP/main/include/CL/opencl.hpp
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_TARGET_OPENCL_VERSION 120
#include "opencl.hpp"

// Includes the CLBlast library
#include <clblast.h>

// =================================================================================================

// Example use of the double-precision Xtrsm routine DTRSM, solving A*X = alpha*B, storing the
// result in the memory of matrix B. Uses row-major storage (C-style).
int main() {

  // OpenCL platform/device settings
  const auto platform_id = 0;
  const auto device_id = 0;

  // Example TRSM arguments
  const size_t m = 4;
  const size_t n = 3;
  const double alpha = 1.0;
  const auto a_ld = m;
  const auto b_ld = n;

  // Initializes the OpenCL platform
  auto platforms = std::vector<cl::Platform>();
  cl::Platform::get(&platforms);
  if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; }
  auto platform = platforms[platform_id];

  // Initializes the OpenCL device
  auto devices = std::vector<cl::Device>();
  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
  if (devices.size() == 0 || device_id >= devices.size()) { return 1; }
  auto device = devices[device_id];

  // Creates the OpenCL context, queue, and an event
  auto device_as_vector = std::vector<cl::Device>{device};
  auto context = cl::Context(device_as_vector);
  auto queue = cl::CommandQueue(context, device);
  auto event = cl_event{nullptr};

  // Populate host matrices with some example data
  auto host_a = std::vector<double>({1.0,  2.0,  1.0, -2.0,
                                    0.0, -1.0, -2.0,  0.0,
                                    0.0,  0.0,  1.0,  1.0,
                                    0.0,  0.0,  0.0, -1.0});
  auto host_b = std::vector<double>({-1.0, -1.0,  3.0,
                                     1.0, -3.0,  2.0,
                                     1.0,  1.0, -1.0,
                                     4.0, -1.0, -2.0});
  // Expected result:
  //   8 -5  2
  // -11  3  4
  //   5  0 -3
  //  -4  1  2

  // Copy the matrices to the device
  auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(double));
  auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(double));
  queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(double), host_a.data());
  queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());

  // Call the DTRSM routine. Note that the type of alpha and beta (double) determine the precision.
  auto queue_plain = queue();
  auto status = clblast::Trsm(clblast::Layout::kRowMajor, clblast::Side::kLeft,
                              clblast::Triangle::kUpper, clblast::Transpose::kNo,
                              clblast::Diagonal::kNonUnit,
                              m, n,
                              alpha,
                              device_a(), 0, a_ld,
                              device_b(), 0, b_ld,
                              &queue_plain, &event);

  // Retrieves the results
  if (status == clblast::StatusCode::kSuccess) {
    clWaitForEvents(1, &event);
    clReleaseEvent(event);
  }
  queue.enqueueReadBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data());

  // Example completed. See "clblast.h" for status codes (0 -> success).
  printf("Completed TRSM with status %d and results:\n", static_cast<int>(status));
  for (auto i = size_t{0}; i < m; ++i) {
    for (auto j = size_t{0}; j < n; ++j) {
      printf("%3.0f ", host_b[i * b_ld + j]);
    }
    printf("\n");
  }
  return 0;
}

// =================================================================================================