diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-15 12:34:05 +0200 |
---|---|---|
committer | Cedric Nugteren <web@cedricnugteren.nl> | 2016-06-15 12:34:05 +0200 |
commit | 39b7dbc5e37829abfbcfb77852b9138b31540b42 (patch) | |
tree | 4f19fb31c5f78504a076fb65331c22efd289c68a | |
parent | b894611ad196fc9cac40bf5861a23b35c52c52b5 (diff) |
Added some constness to variables related to the GEMM routines
-rw-r--r-- | include/internal/routine.h | 4 |
-rw-r--r-- | src/routine.cc | 90 |
-rw-r--r-- | src/routines/level3/xgemm.cc | 48 |
3 files changed, 74 insertions, 68 deletions
diff --git a/include/internal/routine.h b/include/internal/routine.h index 6df186c5..d420e2db 100644 --- a/include/internal/routine.h +++ b/include/internal/routine.h @@ -44,12 +44,12 @@ class Routine { protected: // Runs a kernel given the global and local thread sizes - StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global, + StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global, const std::vector<size_t> &local, EventPointer event, std::vector<Event>& waitForEvents); // As above, but without an event waiting list - StatusCode RunKernel(Kernel &kernel, std::vector<size_t> &global, + StatusCode RunKernel(Kernel &kernel, std::vector<size_t> global, const std::vector<size_t> &local, EventPointer event); // Tests for valid inputs of matrices A, B, and C diff --git a/src/routine.cc b/src/routine.cc index dee1f090..4b334e60 100644 --- a/src/routine.cc +++ b/src/routine.cc @@ -62,7 +62,7 @@ StatusCode Routine<T>::SetUp() { // program will be added to the cache. // Inspects whether or not cl_khr_fp64 is supported in case of double precision - auto extensions = device_.Capabilities(); + const auto extensions = device_.Capabilities(); if (precision_ == Precision::kDouble || precision_ == Precision::kComplexDouble) { if (extensions.find(kKhronosDoublePrecision) == std::string::npos) { return StatusCode::kNoDoublePrecision; @@ -106,17 +106,17 @@ StatusCode Routine<T>::SetUp() { } // Combines everything together into a single source string - auto source_string = defines + common_header + source_string_; + const auto source_string = defines + common_header + source_string_; // Compiles the kernel try { auto program = Program(context_, source_string); auto options = std::vector<std::string>(); - auto build_status = program.Build(device_, options); + const auto build_status = program.Build(device_, options); // Checks for compiler crashes/errors/warnings if (build_status == BuildStatus::kError) { - auto message = program.GetBuildInfo(device_); + const auto 
message = program.GetBuildInfo(device_); fprintf(stdout, "OpenCL compiler error/warning: %s\n", message.c_str()); return StatusCode::kBuildProgramFailure; } @@ -136,7 +136,7 @@ StatusCode Routine<T>::SetUp() { // Enqueues a kernel, waits for completion, and checks for errors template <typename T> -StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global, +StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global, const std::vector<size_t> &local, EventPointer event, std::vector<Event>& waitForEvents) { @@ -157,7 +157,7 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global, } // Tests for local memory usage - auto local_mem_usage = kernel.LocalMemUsage(device_); + const auto local_mem_usage = kernel.LocalMemUsage(device_); if (!device_.IsLocalMemoryValid(local_mem_usage)) { return StatusCode::kInvalidLocalMemUsage; } // Launches the kernel (and checks for launch errors) @@ -171,7 +171,7 @@ StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global, // As above, but without an event waiting list template <typename T> -StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> &global, +StatusCode Routine<T>::RunKernel(Kernel &kernel, std::vector<size_t> global, const std::vector<size_t> &local, EventPointer event) { auto emptyWaitingList = std::vector<Event>(); return RunKernel(kernel, global, local, event, emptyWaitingList); @@ -186,8 +186,8 @@ StatusCode Routine<T>::TestMatrixA(const size_t one, const size_t two, const Buf const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimA; } try { - auto required_size = (ld*(two-1) + one + offset)*data_size; - auto buffer_size = buffer.GetSize(); + const auto required_size = (ld*(two-1) + one + offset)*data_size; + const auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; } } catch (...) 
{ return StatusCode::kInvalidMatrixA; } return StatusCode::kSuccess; @@ -200,8 +200,8 @@ StatusCode Routine<T>::TestMatrixB(const size_t one, const size_t two, const Buf const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimB; } try { - auto required_size = (ld*(two-1) + one + offset)*data_size; - auto buffer_size = buffer.GetSize(); + const auto required_size = (ld*(two-1) + one + offset)*data_size; + const auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryB; } } catch (...) { return StatusCode::kInvalidMatrixB; } return StatusCode::kSuccess; @@ -214,8 +214,8 @@ StatusCode Routine<T>::TestMatrixC(const size_t one, const size_t two, const Buf const size_t offset, const size_t ld, const size_t data_size) { if (ld < one) { return StatusCode::kInvalidLeadDimC; } try { - auto required_size = (ld*(two-1) + one + offset)*data_size; - auto buffer_size = buffer.GetSize(); + const auto required_size = (ld*(two-1) + one + offset)*data_size; + const auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryC; } } catch (...) { return StatusCode::kInvalidMatrixC; } return StatusCode::kSuccess; @@ -226,8 +226,8 @@ template <typename T> StatusCode Routine<T>::TestMatrixAP(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t data_size) { try { - auto required_size = (((n*(n+1))/2) + offset)*data_size; - auto buffer_size = buffer.GetSize(); + const auto required_size = (((n*(n+1))/2) + offset)*data_size; + const auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryA; } } catch (...) 
{ return StatusCode::kInvalidMatrixA; } return StatusCode::kSuccess; @@ -242,8 +242,8 @@ StatusCode Routine<T>::TestVectorX(const size_t n, const Buffer<T> &buffer, cons const size_t inc, const size_t data_size) { if (inc == 0) { return StatusCode::kInvalidIncrementX; } try { - auto required_size = ((n-1)*inc + 1 + offset)*data_size; - auto buffer_size = buffer.GetSize(); + const auto required_size = ((n-1)*inc + 1 + offset)*data_size; + const auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryX; } } catch (...) { return StatusCode::kInvalidVectorX; } return StatusCode::kSuccess; @@ -256,8 +256,8 @@ StatusCode Routine<T>::TestVectorY(const size_t n, const Buffer<T> &buffer, cons const size_t inc, const size_t data_size) { if (inc == 0) { return StatusCode::kInvalidIncrementY; } try { - auto required_size = ((n-1)*inc + 1 + offset)*data_size; - auto buffer_size = buffer.GetSize(); + const auto required_size = ((n-1)*inc + 1 + offset)*data_size; + const auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryY; } } catch (...) { return StatusCode::kInvalidVectorY; } return StatusCode::kSuccess; @@ -271,8 +271,8 @@ template <typename T> StatusCode Routine<T>::TestVectorDot(const size_t n, const Buffer<T> &buffer, const size_t offset, const size_t data_size) { try { - auto required_size = (n + offset)*data_size; - auto buffer_size = buffer.GetSize(); + const auto required_size = (n + offset)*data_size; + const auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; } } catch (...) 
{ return StatusCode::kInvalidVectorDot; } return StatusCode::kSuccess; @@ -284,8 +284,8 @@ template <typename T> StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int> &buffer, const size_t offset, const size_t data_size) { try { - auto required_size = (n + offset)*data_size; - auto buffer_size = buffer.GetSize(); + const auto required_size = (n + offset)*data_size; + const auto buffer_size = buffer.GetSize(); if (buffer_size < required_size) { return StatusCode::kInsufficientMemoryDot; } } catch (...) { return StatusCode::kInvalidVectorDot; } return StatusCode::kSuccess; @@ -293,7 +293,7 @@ StatusCode Routine<T>::TestVectorIndex(const size_t n, const Buffer<unsigned int // ================================================================================================= -// Copies or transposes a matrix and pads/unpads it with zeros +// Copies or transposes a matrix and optionally pads/unpads it with zeros template <typename T> StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Event>& waitForEvents, const size_t src_one, const size_t src_two, @@ -372,36 +372,42 @@ StatusCode Routine<T>::PadCopyTransposeMatrix(EventPointer event, std::vector<Ev // Launches the kernel and returns the error code. Uses global and local thread sizes based on // parameters in the database. 
- auto status = StatusCode::kSuccess; if (do_transpose) { if (use_fast_kernel) { - auto global = std::vector<size_t>{dest_one / db_["TRA_WPT"], - dest_two / db_["TRA_WPT"]}; - auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]}; - status = RunKernel(kernel, global, local, event, waitForEvents); + const auto global = std::vector<size_t>{ + dest_one / db_["TRA_WPT"], + dest_two / db_["TRA_WPT"] + }; + const auto local = std::vector<size_t>{db_["TRA_DIM"], db_["TRA_DIM"]}; + return RunKernel(kernel, global, local, event, waitForEvents); } else { - auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]), - Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"])}; - auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]}; - status = RunKernel(kernel, global, local, event, waitForEvents); + const auto global = std::vector<size_t>{ + Ceil(CeilDiv(dest_one, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]), + Ceil(CeilDiv(dest_two, db_["PADTRA_WPT"]), db_["PADTRA_TILE"]) + }; + const auto local = std::vector<size_t>{db_["PADTRA_TILE"], db_["PADTRA_TILE"]}; + return RunKernel(kernel, global, local, event, waitForEvents); } } else { if (use_fast_kernel) { - auto global = std::vector<size_t>{dest_one / db_["COPY_VW"], - dest_two / db_["COPY_WPT"]}; - auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]}; - status = RunKernel(kernel, global, local, event, waitForEvents); + const auto global = std::vector<size_t>{ + dest_one / db_["COPY_VW"], + dest_two / db_["COPY_WPT"] + }; + const auto local = std::vector<size_t>{db_["COPY_DIMX"], db_["COPY_DIMY"]}; + return RunKernel(kernel, global, local, event, waitForEvents); } else { - auto global = std::vector<size_t>{Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]), - Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"])}; - auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; - status = RunKernel(kernel, global, local, 
event, waitForEvents); + const auto global = std::vector<size_t>{ + Ceil(CeilDiv(dest_one, db_["PAD_WPTX"]), db_["PAD_DIMX"]), + Ceil(CeilDiv(dest_two, db_["PAD_WPTY"]), db_["PAD_DIMY"]) + }; + const auto local = std::vector<size_t>{db_["PAD_DIMX"], db_["PAD_DIMY"]}; + return RunKernel(kernel, global, local, event, waitForEvents); } } - return status; } catch (...) { return StatusCode::kInvalidKernel; } } diff --git a/src/routines/level3/xgemm.cc b/src/routines/level3/xgemm.cc index d08b6038..6fa6a811 100644 --- a/src/routines/level3/xgemm.cc +++ b/src/routines/level3/xgemm.cc @@ -67,27 +67,27 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, // that the Xgemm kernel expects either matrices A and C (in case of row-major) or B (in case of // col-major) to be transformed, so transposing requirements are not the same as whether or not // the matrix is actually transposed in memory. - auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); - auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || - (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); - auto c_rotated = (layout == Layout::kRowMajor); - auto a_do_transpose = a_rotated; - auto b_do_transpose = !b_rotated; - auto c_do_transpose = c_rotated; + const auto a_rotated = (layout == Layout::kColMajor && a_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && a_transpose == Transpose::kNo); + const auto b_rotated = (layout == Layout::kColMajor && b_transpose != Transpose::kNo) || + (layout == Layout::kRowMajor && b_transpose == Transpose::kNo); + const auto c_rotated = (layout == Layout::kRowMajor); + const auto a_do_transpose = a_rotated; + const auto b_do_transpose = !b_rotated; + const auto c_do_transpose = c_rotated; // In case of complex data-types, the transpose can also become a conjugate transpose - auto a_conjugate = (a_transpose == Transpose::kConjugate); - auto 
b_conjugate = (b_transpose == Transpose::kConjugate); + const auto a_conjugate = (a_transpose == Transpose::kConjugate); + const auto b_conjugate = (b_transpose == Transpose::kConjugate); // Computes the first and second dimensions of the 3 matrices taking into account whether the // matrices are rotated or not - auto a_one = (a_rotated) ? k : m; - auto a_two = (a_rotated) ? m : k; - auto b_one = (b_rotated) ? n : k; - auto b_two = (b_rotated) ? k : n; - auto c_one = (c_rotated) ? n : m; - auto c_two = (c_rotated) ? m : n; + const auto a_one = (a_rotated) ? k : m; + const auto a_two = (a_rotated) ? m : k; + const auto b_one = (b_rotated) ? n : k; + const auto b_two = (b_rotated) ? k : n; + const auto c_one = (c_rotated) ? n : m; + const auto c_two = (c_rotated) ? m : n; // Tests three matrices (A, B, C) for validity, first from a perspective of the OpenCL buffers and // their sizes, and then from a perspective of parameter values (e.g. m, n, k). Tests whether the @@ -104,9 +104,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, if (ErrorIn(status)) { return status; } // Calculates the ceiled versions of m, n, and k - auto m_ceiled = Ceil(m, db_["MWG"]); - auto n_ceiled = Ceil(n, db_["NWG"]); - auto k_ceiled = Ceil(k, db_["KWG"]); + const auto m_ceiled = Ceil(m, db_["MWG"]); + const auto n_ceiled = Ceil(n, db_["NWG"]); + const auto k_ceiled = Ceil(k, db_["KWG"]); // The padded/transposed input/output matrices: if memory allocation fails, throw an exception try { @@ -123,9 +123,9 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, c_do_transpose == false; // Creates the temporary matrices - auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*m_ceiled); - auto b_temp = (b_no_temp) ? b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); - auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled); + const auto a_temp = (a_no_temp) ? a_buffer : Buffer<T>(context_, k_ceiled*m_ceiled); + const auto b_temp = (b_no_temp) ? 
b_buffer : Buffer<T>(context_, k_ceiled*n_ceiled); + const auto c_temp = (c_no_temp) ? c_buffer : Buffer<T>(context_, m_ceiled*n_ceiled); // Upload the scalar arguments as constant buffers to the device (needed for half-precision) auto alpha_buffer = Buffer<T>(context_, 1); @@ -187,11 +187,11 @@ StatusCode Xgemm<T>::DoGemm(const Layout layout, kernel.SetArgument(7, c_temp()); // Computes the global and local thread sizes - auto global = std::vector<size_t>{ + const auto global = std::vector<size_t>{ (m_ceiled * db_["MDIMC"]) / db_["MWG"], (n_ceiled * db_["NDIMC"]) / db_["NWG"] }; - auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; + const auto local = std::vector<size_t>{db_["MDIMC"], db_["NDIMC"]}; // Launches the kernel auto eventKernel = Event(); |