1 files changed, 0 insertions, 415 deletions
diff --git a/src/routine.cc b/src/routine.cc
deleted file mode 100644
index eee4c7cc..00000000
--- a/src/routine.cc
+++ /dev/null
@@ -1,415 +0,0 @@
-
-// =================================================================================================
-// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
-// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
-// width of 100 characters per line.
-//
-// Author(s):
-//   Cedric Nugteren <www.cedricnugteren.nl>
-//
-// This file implements the Routine base class (see the header for information about the class).
-//
-// =================================================================================================
-
-#include <string>
-#include <vector>
-
-#include "internal/routine.h"
-
-namespace clblast {
-// =================================================================================================
-
-// Constructor: not much here, because no status codes can be returned
-template <typename T>
-Routine<T>::Routine(Queue &queue, EventPointer event, const std::string &name,
-                    const std::vector<std::string> &routines, const Precision precision):
-    precision_(precision),
-    routine_name_(name),
-    queue_(queue),
-    event_(event),
-    context_(queue_.GetContext()),
-    device_(queue_.GetDevice()),
-    device_name_(device_.Name()),
-    max_work_item_dimensions_(device_.MaxWorkItemDimensions()),
-    max_work_item_sizes_(device_.MaxWorkItemSizes()),
-    max_work_group_size_(device_.MaxWorkGroupSize()),
-    db_(queue_, routines, precision_) {
-}
-
-// =================================================================================================
-
-// Separate set-up function to allow for status codes to be returned
-template <typename T>
-StatusCode Routine<T>::SetUp() {
-
-  // Queries the cache to see whether or not the program (context-specific) is already there
-  if (ProgramIsInCache()) { return StatusCode::kSuccess; }
-
-  // Queries the cache to see whether or not the binary (device-specific) is already there. If it
-  // is, a program is created and stored in the cache
-  if (BinaryIsInCache()) {
-    try {
-      auto& binary = cache::GetBinaryFromCache(device_name_, precision_, routine_name_);
-      auto program = Program(device_, context_, binary);
-      auto options = std::vector<std::string>();
-      program.Build(device_, options);
-      StoreProgramToCache(program);
-    } catch (...) { return StatusCode::kBuildProgramFailure; }
-    return StatusCode::kSuccess;
-  }
-
-  // Otherwise, the kernel will be compiled and program will be built. Both the binary and the
-  // program will be added to the cache.
-
-  // Inspects whether or not cl_khr_fp64 is supported in case of double precision
-  auto extensions = device_.Capabilities();
-  if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) {
-    if (extensions.find(kKhronosDoublePrecision) == std::string::npos) {
-      return StatusCode::kNoDoublePrecision;
-    }
-  }
-
-  // As above, but for cl_khr_fp16 (half precision)
-  if (precision_ == Precision::kHalf) {
-    if (extensions.find(kKhronosHalfPrecision) == std::string::npos) {
-      return StatusCode::kNoHalfPrecision;
-    }
-  }
-
-  // Loads the common header (typedefs and defines and such)
-  std::string common_header =
-    #include "kernels/common.opencl"
-  ;
-
-  // Collects the parameters for this device in the form of defines, and adds the precision
-  auto defines = db_.GetDefines();
-  defines += "#define PRECISION "+ToString(static_cast<int>(precision_))+"\n";
-
-  // Adds the name of the routine as a define
-  defines += "#define ROUTINE_"+routine_name_+"\n";
-
-  // Determines whether this is a specific device
-  const auto isAMD = device_.Vendor() == "AMD" || device_.Vendor() == "Advanced Micro Devices, Inc.";
-  const auto isGPU = device_.Type() == "GPU";
-
-  // For specific devices, use the non-IEE754 compilant OpenCL mad() instruction. This can improve
-  // performance, but might result in a reduced accuracy.
-  if (isAMD && isGPU) {
-    defines += "#define USE_CL_MAD 1\n";
-  }
-
-  // For specific devices, use staggered/shuffled workgroup indices.
-  if (isAMD && isGPU) {
-    defines += "#define USE_STAGGERED_INDICES 1\n";
-  }
-
-  // Combines everything together into a single source string
-  auto source_string = defines + common_header + source_string_;
-
-  // Compiles the kernel
-  try {
-    auto program = Program(context_, source_string);
-    auto options = std::vector<std::string>();
-    auto build_status = program.Build(device_, options);
-
-    // Checks for compiler crashes/errors/warnings
-    if (build_status == BuildStatus::kError) {
-      auto message = program.GetBuildInfo(device_);
-      fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str());
-      return StatusCode::kBuildProgramFailure;
-    }
-    if (build_status == BuildStatus::kInvalid) { return StatusCode::kInvalidBinary; }
-
-    // Store the compiled binary and program in the cache
-    const auto binary = program.GetIR();
-    StoreBinaryToCache(binary);
-    StoreProgramToCache(program);
-  } catch (...) { return StatusCode::kBuildProgramFailure; }
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Enqueues a kernel, waits for completion, and checks for errors
-template <typename T>
-StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
-                                 const std::vector<size_t> &local, EventPointer event,
-                                 std::vector<Event>& waitForEvents) {
-
-  // Tests for validity of the local thread sizes
-  if (local.size() > max_work_item_dimensions_) {
-    return StatusCode::kInvalidLocalNumDimensions; 
-  }
-  for (auto i=size_t{0}; i<local.size(); ++i) {
-    if (local[i] > max_work_item_sizes_[i]) { return StatusCode::kInvalidLocalThreadsDim; }
-  }
-  auto local_size = size_t{1};
-  for (auto &item: local) { local_size *= item; }
-  if (local_size > max_work_group_size_) { return StatusCode::kInvalidLocalThreadsTotal; }
-
-  // Make sure the global thread sizes are at least equal to the local sizes
-  for (auto i=size_t{0}; i<global.size(); ++i) {
-    if (global[i] < local[i]) { global[i] = local[i]; }
-  }
-
-  // Tests for local memory usage
-  auto local_mem_usage = kernel.LocalMemUsage(device_);
-  if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; }
-
-  // Launches the kernel (and checks for launch errors)
-  try {
-    kernel.Launch(queue_, global, local, event, waitForEvents);
-  } catch (...) { return StatusCode::kKernelLaunchError; }
-
-  // No errors, normal termination of this function
-  return StatusCode::kSuccess;
-}
-
-// As above, but without an event waiting list
-template <typename T>
-StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global,
-                                 const std::vector<size_t> &local, EventPointer event) {
-  auto emptyWaitingList = std::vector<Event>();
-  return RunKernel(kernel, global, local, event, emptyWaitingList);
-}
-
-// =================================================================================================
-
-// Tests matrix A for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimA; }
-  try {
-    auto required_size = (ld*(two-1) + one + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix B for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimB; }
-  try {
-    auto required_size = (ld*(two-1) + one + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; }
-  } catch (...) { return StatusCode::kInvalidMatrixB; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix C for validity: checks for a valid OpenCL buffer, a valid lead-dimension, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buffer<T> &buffer,
-                                   const size_t offset, const size_t ld, const size_t data_size) {
-  if (ld < one) { return StatusCode::kInvalidLeadDimC; }
-  try {
-    auto required_size = (ld*(two-1) + one + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; }
-  } catch (...) { return StatusCode::kInvalidMatrixC; }
-  return StatusCode::kSuccess;
-}
-
-// Tests matrix AP for validity: checks for a valid OpenCL buffer and for a sufficient buffer size
-template <typename T>
-StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer,
-                                    const size_t offset, const size_t data_size) {
-  try {
-    auto required_size = (((n*(n+1))/2) + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; }
-  } catch (...) { return StatusCode::kInvalidMatrixA; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector X for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                   const size_t inc, const size_t data_size) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementX; }
-  try {
-    auto required_size = ((n-1)*inc + 1 + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; }
-  } catch (...) { return StatusCode::kInvalidVectorX; }
-  return StatusCode::kSuccess;
-}
-
-// Tests vector Y for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                   const size_t inc, const size_t data_size) {
-  if (inc == 0) { return StatusCode::kInvalidIncrementY; }
-  try {
-    auto required_size = ((n-1)*inc + 1 + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; }
-  } catch (...) { return StatusCode::kInvalidVectorY; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Tests vector dot for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset,
-                                     const size_t data_size) {
-  try {
-    auto required_size = (n + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
-  } catch (...) { return StatusCode::kInvalidVectorDot; }
-  return StatusCode::kSuccess;
-}
-
-// Tests vector index for validity: checks for a valid increment, a valid OpenCL buffer, and for a
-// sufficient buffer size.
-template <typename T>
-StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer,
-                                       const size_t offset, const size_t data_size) {
-  try {
-    auto required_size = (n + offset)*data_size;
-    auto buffer_size = buffer.GetSize();
-    if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; }
-  } catch (...) { return StatusCode::kInvalidVectorDot; }
-  return StatusCode::kSuccess;
-}
-
-// =================================================================================================
-
-// Copies or transposes a matrix and pads/unpads it with zeros
-template <typename T>
-StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents,
-                                              const size_t src_one, const size_t src_two,
-                                              const size_t src_ld, const size_t src_offset,
-                                              const Buffer<T> &src,
-                                              const size_t dest_one, const size_t dest_two,
-                                              const size_t dest_ld, const size_t dest_offset,
-                                              const Buffer<T> &dest,
-                                              const Program &program, const bool do_pad,
-                                              const bool do_transpose, const bool do_conjugate,
-                                              const bool upper, const bool lower,
-                                              const bool diagonal_imag_zero) {
-
-  // Determines whether or not the fast-version could potentially be used
-  auto use_fast_kernel = (src_offset == 0) && (dest_offset == 0) && (do_conjugate == false) &&
-                         (src_one == dest_one) && (src_two == dest_two) && (src_ld == dest_ld) &&
-                         (upper == false) && (lower == false) && (diagonal_imag_zero == false);
-
-  // Determines the right kernel
-  auto kernel_name = std::string{};
-  if (do_transpose) {
-    if (use_fast_kernel &&
-        IsMultiple(src_ld, db_["TRA_WPT"]) &&
-        IsMultiple(src_one, db_["TRA_WPT"]*db_["TRA_WPT"]) &&
-        IsMultiple(src_two, db_["TRA_WPT"]*db_["TRA_WPT"])) {
-      kernel_name = "TransposeMatrix";
-    }
-    else {
-      use_fast_kernel = false;
-      kernel_name = (do_pad) ? "PadTransposeMatrix" : "UnPadTransposeMatrix";
-    }
-  }
-  else {
-    if (use_fast_kernel &&
-        IsMultiple(src_ld, db_["COPY_VW"]) &&
-        IsMultiple(src_one, db_["COPY_VW"]*db_["COPY_DIMX"]) &&
-        IsMultiple(src_two, db_["COPY_WPT"]*db_["COPY_DIMY"])) {
-      kernel_name = "CopyMatrix";
-    }
-    else {
-      use_fast_kernel = false;
-      kernel_name = (do_pad) ? "PadMatrix" : "UnPadMatrix";
-    }
-  }
-
-  // Retrieves the kernel from the compiled binary
-  try {
-    auto kernel = Kernel(program, kernel_name);
-
-    // Sets the kernel arguments
-    if (use_fast_kernel) {
-      kernel.SetArgument(0, static_cast<int>(src_ld));
-      kernel.SetArgument(1, src());
-      kernel.SetArgument(2, dest());
-    }
-    else {
-      kernel.SetArgument(0, static_cast<int>(src_one));
-      kernel.SetArgument(1, static_cast<int>(src_two));
-      kernel.SetArgument(2, static_cast<int>(src_ld));
-      kernel.SetArgument(3, static_cast<int>(src_offset));
-      kernel.SetArgument(4, src());
-      kernel.SetArgument(5, static_cast<int>(dest_one));
-      kernel.SetArgument(6, static_cast<int>(dest_two));
-      kernel.SetArgument(7, static_cast<int>(dest_ld));
-      kernel.SetArgument(8, static_cast<int>(dest_offset));
-      kernel.SetArgument(9, dest());
-      if (do_pad) {
-        kernel.SetArgument(10, static_cast<int>(do_conjugate));
-      }
-      else {
-        kernel.SetArgument(10, static_cast<int>(upper));
-        kernel.SetArgument(11, static_cast<int>(lower));
-        kernel.SetArgument(12, static_cast<int>(diagonal_imag_zero));
-      }
-    }
-
-    // Launches the kernel and returns the error code. Uses global and local thread sizes based on
-    // parameters in the database.
-    auto status = StatusCode::kSuccess;
-    if (do_transpose) {
-      if (use_fast_kernel) {
-        auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"],
-                                          dest_two / db_["TRA_WPT"]};
-        auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]};
-        status = RunKernel(kernel, global, local, event, waitForEvents);
-      }
-      else {
-        auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]),
-                                          Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])};
-        auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]};
-        status = RunKernel(kernel, global, local, event, waitForEvents);
-      }
-    }
-    else {
-      if (use_fast_kernel) {
-        auto global = std::vector<size_t>{dest_one / db_["COPY_VW"],
-                                          dest_two / db_["COPY_WPT"]};
-        auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]};
-        status = RunKernel(kernel, global, local, event, waitForEvents);
-      }
-      else {
-        auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]),
-                                          Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])};
-        auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]};
-        status = RunKernel(kernel, global, local, event, waitForEvents);
-      }
-    }
-    return status;
-  } catch (...) { return StatusCode::kInvalidKernel; }
-}
-
-// =================================================================================================
-
-// Compiles the templated class
-template class Routine<float>;
-template class Routine<double>;
-template class Routine<float2>;
-template class Routine<double2>;
-
-// =================================================================================================
-} // namespace clblast