summary | refs | log | tree | commit | diff
path: root/src/routines/levelx/xgemmbatched.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/routines/levelx/xgemmbatched.cpp')
-rw-r--r-- src/routines/levelx/xgemmbatched.cpp | 22
1 files changed, 11 insertions, 11 deletions
diff --git a/src/routines/levelx/xgemmbatched.cpp b/src/routines/levelx/xgemmbatched.cpp
index 4e9f0004..8a015e97 100644
--- a/src/routines/levelx/xgemmbatched.cpp
+++ b/src/routines/levelx/xgemmbatched.cpp
@@ -100,8 +100,8 @@ void XgemmBatched<T>::DoGemmBatched(const Layout layout, const Transpose a_trans
}
// Upload the scalar arguments to the device
- auto alphas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
- auto betas_device = Buffer<T>(context_, BufferAccess::kReadOnly, batch_count);
+ auto alphas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count);
+ auto betas_device = Buffer<T>(context_, BufferAccess::kReadWrite, batch_count);
alphas_device.Write(queue_, batch_count, alphas);
betas_device.Write(queue_, batch_count, betas);
@@ -200,8 +200,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
// to fill it up until it reaches a certain multiple of size (kernel parameter dependent). In
// case nothing has to be done, these kernels can be skipped.
if (!a_no_temp) {
- auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto a_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
a_offsets_device.Write(queue_, batch_count, a_offsets);
a_offsets_i_device.Write(queue_, batch_count, a_offsets_i);
auto eventProcessA = Event();
@@ -214,8 +214,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
// As above, but now for matrix B
if (!b_no_temp) {
- auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto b_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
b_offsets_device.Write(queue_, batch_count, b_offsets);
b_offsets_i_device.Write(queue_, batch_count, b_offsets_i);
auto eventProcessB = Event();
@@ -227,8 +227,8 @@ void XgemmBatched<T>::BatchedGemmIndirect(const size_t m, const size_t n, const
}
// As above, but now for matrix C
- auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto c_offsets_i_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
if (!c_no_temp) {
c_offsets_device.Write(queue_, batch_count, c_offsets);
c_offsets_i_device.Write(queue_, batch_count, c_offsets_i);
@@ -297,9 +297,9 @@ void XgemmBatched<T>::BatchedGemmDirect(const size_t m, const size_t n, const si
const size_t batch_count) {
// Uploads the offsets to the device
- auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
- auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadOnly, batch_count);
+ auto a_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto b_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
+ auto c_offsets_device = Buffer<int>(context_, BufferAccess::kReadWrite, batch_count);
a_offsets_device.Write(queue_, batch_count, a_offsets);
b_offsets_device.Write(queue_, batch_count, b_offsets);
c_offsets_device.Write(queue_, batch_count, c_offsets);