diff options
author | Cedric Nugteren <web@cedricnugteren.nl> | 2017-10-20 12:07:30 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-10-20 12:07:30 +0200 |
commit | 42dcd8fd8a81c66783827dc4826117b3af610376 (patch) | |
tree | a321cdec1fbb96ec54257b76dccb91184f01b015 /src/routines | |
parent | 48133a0cd1a7b61b87906ec1f4608e766e20a973 (diff) | |
parent | 363568787ebfcdc0c5e6af9c3c8e71c702e2f951 (diff) |
Merge pull request #204 from CNugteren/cuda_api
Cuda API to CLBlast
Diffstat (limited to 'src/routines')
-rw-r--r-- | src/routines/common.hpp | 3 |
-rw-r--r-- | src/routines/levelx/xaxpybatched.cpp | 6 |
-rw-r--r-- | src/routines/levelx/xgemmbatched.cpp | 22 |
-rw-r--r-- | src/routines/routines.hpp | 76 |
4 files changed, 91 insertions, 16 deletions
diff --git a/src/routines/common.hpp b/src/routines/common.hpp index 84ccd9d2..bf3b1762 100644 --- a/src/routines/common.hpp +++ b/src/routines/common.hpp @@ -19,8 +19,7 @@ #include <string> #include <vector> -#include "clpp11.hpp" -#include "clblast.h" +#include "utilities/utilities.hpp" #include "database/database.hpp" namespace clblast { diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp index 0b755ccf..52c27b78 100644 --- a/src/routines/levelx/xaxpybatched.cpp +++ b/src/routines/levelx/xaxpybatched.cpp @@ -59,9 +59,9 @@ void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas x_offsets_int[batch] = static_cast<int>(x_offsets[batch]); y_offsets_int[batch] = static_cast<int>(y_offsets[batch]); } - auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto alphas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count); x_offsets_device.Write(queue_, batch_count, x_offsets_int); y_offsets_device.Write(queue_, batch_count, y_offsets_int); alphas_device.Write(queue_, batch_count, alphas); diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp index 4e9f0004..8a015e97 100644 --- a/src/routines/levelx/xgemmbatched.cpp +++ b/src/routines/levelx/xgemmbatched.cpp @@ -100,8 +100,8 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans } // Upload the scalar arguments to the device - auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); - auto betas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count); + auto alphas_device = 
Buffer<T>(context_, BufferAccess::kReadWrite, batch_count); + auto betas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count); alphas_device.Write(queue_, batch_count, alphas); betas_device.Write(queue_, batch_count, betas); @@ -200,8 +200,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const // to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In // case nothing has to be done, these kernels can be skipped. if (!a_no_temp) { - auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); a_offsets_device.Write(queue_, batch_count, a_offsets); a_offsets_i_device.Write(queue_, batch_count, a_offsets_i); auto eventProcessA = Event(); @@ -214,8 +214,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const // As above, but now for matrix B if (!b_no_temp) { - auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); b_offsets_device.Write(queue_, batch_count, b_offsets); b_offsets_i_device.Write(queue_, batch_count, b_offsets_i); auto eventProcessB = Event(); @@ -227,8 +227,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const } // As above, but now for matrix C - auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto 
c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); if (!c_no_temp) { c_offsets_device.Write(queue_, batch_count, c_offsets); c_offsets_i_device.Write(queue_, batch_count, c_offsets_i); @@ -297,9 +297,9 @@ void XgemmBatched<T>::BatchedGemmDirect(const size_t m, const size_t n, const si const size_t batch_count) { // Uploads the offsets to the device - auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); - auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count); + auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); + auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count); a_offsets_device.Write(queue_, batch_count, a_offsets); b_offsets_device.Write(queue_, batch_count, b_offsets); c_offsets_device.Write(queue_, batch_count, c_offsets); diff --git a/src/routines/routines.hpp b/src/routines/routines.hpp new file mode 100644 index 00000000..9e7768b9 --- /dev/null +++ b/src/routines/routines.hpp @@ -0,0 +1,76 @@ + +// ================================================================================================= +// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This +// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- +// width of 100 characters per line. +// +// Author(s): +// Cedric Nugteren <www.cedricnugteren.nl> +// +// This file contains all the includes of all the routines in CLBlast. 
+// +// ================================================================================================= + +#ifndef CLBLAST_ROUTINES_ROUTINES_H_ +#define CLBLAST_ROUTINES_ROUTINES_H_ + +// BLAS level-1 includes +#include "routines/level1/xswap.hpp" +#include "routines/level1/xscal.hpp" +#include "routines/level1/xcopy.hpp" +#include "routines/level1/xaxpy.hpp" +#include "routines/level1/xdot.hpp" +#include "routines/level1/xdotu.hpp" +#include "routines/level1/xdotc.hpp" +#include "routines/level1/xnrm2.hpp" +#include "routines/level1/xasum.hpp" +#include "routines/level1/xsum.hpp" // non-BLAS routine +#include "routines/level1/xamax.hpp" +#include "routines/level1/xamin.hpp" // non-BLAS routine +#include "routines/level1/xmax.hpp" // non-BLAS routine +#include "routines/level1/xmin.hpp" // non-BLAS routine + +// BLAS level-2 includes +#include "routines/level2/xgemv.hpp" +#include "routines/level2/xgbmv.hpp" +#include "routines/level2/xhemv.hpp" +#include "routines/level2/xhbmv.hpp" +#include "routines/level2/xhpmv.hpp" +#include "routines/level2/xsymv.hpp" +#include "routines/level2/xsbmv.hpp" +#include "routines/level2/xspmv.hpp" +#include "routines/level2/xtrmv.hpp" +#include "routines/level2/xtbmv.hpp" +#include "routines/level2/xtpmv.hpp" +#include "routines/level2/xtrsv.hpp" +#include "routines/level2/xger.hpp" +#include "routines/level2/xgeru.hpp" +#include "routines/level2/xgerc.hpp" +#include "routines/level2/xher.hpp" +#include "routines/level2/xhpr.hpp" +#include "routines/level2/xher2.hpp" +#include "routines/level2/xhpr2.hpp" +#include "routines/level2/xsyr.hpp" +#include "routines/level2/xspr.hpp" +#include "routines/level2/xsyr2.hpp" +#include "routines/level2/xspr2.hpp" + +// BLAS level-3 includes +#include "routines/level3/xgemm.hpp" +#include "routines/level3/xsymm.hpp" +#include "routines/level3/xhemm.hpp" +#include "routines/level3/xsyrk.hpp" +#include "routines/level3/xherk.hpp" +#include "routines/level3/xsyr2k.hpp" +#include 
"routines/level3/xher2k.hpp" +#include "routines/level3/xtrmm.hpp" +#include "routines/level3/xtrsm.hpp" + +// Level-x includes (non-BLAS) +#include "routines/levelx/xomatcopy.hpp" +#include "routines/levelx/xim2col.hpp" +#include "routines/levelx/xaxpybatched.hpp" +#include "routines/levelx/xgemmbatched.hpp" + +// CLBLAST_ROUTINES_ROUTINES_H_ +#endif |