summaryrefslogtreecommitdiff
path: root/src/routines
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2017-10-20 12:07:30 +0200
committerGitHub <noreply@github.com>2017-10-20 12:07:30 +0200
commit42dcd8fd8a81c66783827dc4826117b3af610376 (patch)
treea321cdec1fbb96ec54257b76dccb91184f01b015 /src/routines
parent48133a0cd1a7b61b87906ec1f4608e766e20a973 (diff)
parent363568787ebfcdc0c5e6af9c3c8e71c702e2f951 (diff)
Merge pull request #204 from CNugteren/cuda_api
Cuda API to CLBlast
Diffstat (limited to 'src/routines')
-rw-r--r--src/routines/common.hpp3
-rw-r--r--src/routines/levelx/xaxpybatched.cpp6
-rw-r--r--src/routines/levelx/xgemmbatched.cpp22
-rw-r--r--src/routines/routines.hpp76
4 files changed, 91 insertions, 16 deletions
diff --git a/src/routines/common.hpp b/src/routines/common.hpp
index 84ccd9d2..bf3b1762 100644
--- a/src/routines/common.hpp
+++ b/src/routines/common.hpp
@@ -19,8 +19,7 @@
#include <string>
#include <vector>
-#include "clpp11.hpp"
-#include "clblast.h"
+#include "utilities/utilities.hpp"
#include "database/database.hpp"
namespace clblast {
diff --git a/src/routines/levelx/xaxpybatched.cpp b/src/routines/levelx/xaxpybatched.cpp
index 0b755ccf..52c27b78 100644
--- a/src/routines/levelx/xaxpybatched.cpp
+++ b/src/routines/levelx/xaxpybatched.cpp
@@ -59,9 +59,9 @@ void XaxpyBatched<T>::DoAxpyBatched(const size_t n, const std::vector<T> &alphas
x_offsets_int[batch] = static_cast<int>(x_offsets[batch]);
y_offsets_int[batch] = static_cast<int>(y_offsets[batch]);
}
- auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
+ auto x_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto y_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto alphas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count);
x_offsets_device.Write(queue_, batch_count, x_offsets_int);
y_offsets_device.Write(queue_, batch_count, y_offsets_int);
alphas_device.Write(queue_, batch_count, alphas);
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 4e9f0004..8a015e97 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -100,8 +100,8 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans
}
// Upload the scalar arguments to the device
- auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
- auto betas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
+ auto alphas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count);
+ auto betas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count);
alphas_device.Write(queue_, batch_count, alphas);
betas_device.Write(queue_, batch_count, betas);
@@ -200,8 +200,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
- auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
a_offsets_device.Write(queue_, batch_count, a_offsets);
a_offsets_i_device.Write(queue_, batch_count, a_offsets_i);
auto eventProcessA = Event();
@@ -214,8 +214,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
// As above, but now for matrix B
if (!b_no_temp) {
- auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
b_offsets_device.Write(queue_, batch_count, b_offsets);
b_offsets_i_device.Write(queue_, batch_count, b_offsets_i);
auto eventProcessB = Event();
@@ -227,8 +227,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
}
// As above, but now for matrix C
- auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
if (!c_no_temp) {
c_offsets_device.Write(queue_, batch_count, c_offsets);
c_offsets_i_device.Write(queue_, batch_count, c_offsets_i);
@@ -297,9 +297,9 @@ void XgemmBatched<T>::BatchedGemmDirect(const size_t m, const size_t n, const si
const size_t batch_count) {
// Uploads the offsets to the device
- auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
a_offsets_device.Write(queue_, batch_count, a_offsets);
b_offsets_device.Write(queue_, batch_count, b_offsets);
c_offsets_device.Write(queue_, batch_count, c_offsets);
diff --git a/src/routines/routines.hpp b/src/routines/routines.hpp
new file mode 100644
index 00000000..9e7768b9
--- /dev/null
+++ b/src/routines/routines.hpp
@@ -0,0 +1,76 @@
+
+// =================================================================================================
+// This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
+// project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
+// width of 100 characters per line.
+//
+// Author(s):
+// Cedric Nugteren <www.cedricnugteren.nl>
+//
+// This file contains all the includes of all the routines in CLBlast.
+//
+// =================================================================================================
+
+#ifndef CLBLAST_ROUTINES_ROUTINES_H_
+#define CLBLAST_ROUTINES_ROUTINES_H_
+
+// BLAS level-1 includes
+#include "routines/level1/xswap.hpp"
+#include "routines/level1/xscal.hpp"
+#include "routines/level1/xcopy.hpp"
+#include "routines/level1/xaxpy.hpp"
+#include "routines/level1/xdot.hpp"
+#include "routines/level1/xdotu.hpp"
+#include "routines/level1/xdotc.hpp"
+#include "routines/level1/xnrm2.hpp"
+#include "routines/level1/xasum.hpp"
+#include "routines/level1/xsum.hpp" // non-BLAS routine
+#include "routines/level1/xamax.hpp"
+#include "routines/level1/xamin.hpp" // non-BLAS routine
+#include "routines/level1/xmax.hpp" // non-BLAS routine
+#include "routines/level1/xmin.hpp" // non-BLAS routine
+
+// BLAS level-2 includes
+#include "routines/level2/xgemv.hpp"
+#include "routines/level2/xgbmv.hpp"
+#include "routines/level2/xhemv.hpp"
+#include "routines/level2/xhbmv.hpp"
+#include "routines/level2/xhpmv.hpp"
+#include "routines/level2/xsymv.hpp"
+#include "routines/level2/xsbmv.hpp"
+#include "routines/level2/xspmv.hpp"
+#include "routines/level2/xtrmv.hpp"
+#include "routines/level2/xtbmv.hpp"
+#include "routines/level2/xtpmv.hpp"
+#include "routines/level2/xtrsv.hpp"
+#include "routines/level2/xger.hpp"
+#include "routines/level2/xgeru.hpp"
+#include "routines/level2/xgerc.hpp"
+#include "routines/level2/xher.hpp"
+#include "routines/level2/xhpr.hpp"
+#include "routines/level2/xher2.hpp"
+#include "routines/level2/xhpr2.hpp"
+#include "routines/level2/xsyr.hpp"
+#include "routines/level2/xspr.hpp"
+#include "routines/level2/xsyr2.hpp"
+#include "routines/level2/xspr2.hpp"
+
+// BLAS level-3 includes
+#include "routines/level3/xgemm.hpp"
+#include "routines/level3/xsymm.hpp"
+#include "routines/level3/xhemm.hpp"
+#include "routines/level3/xsyrk.hpp"
+#include "routines/level3/xherk.hpp"
+#include "routines/level3/xsyr2k.hpp"
+#include "routines/level3/xher2k.hpp"
+#include "routines/level3/xtrmm.hpp"
+#include "routines/level3/xtrsm.hpp"
+
+// Level-x includes (non-BLAS)
+#include "routines/levelx/xomatcopy.hpp"
+#include "routines/levelx/xim2col.hpp"
+#include "routines/levelx/xaxpybatched.hpp"
+#include "routines/levelx/xgemmbatched.hpp"
+
+// CLBLAST_ROUTINES_ROUTINES_H_
+#endif