summaryrefslogtreecommitdiff
path: root/src/clblast_cuda.cpp
diff options
context:
space:
mode:
authorCedric Nugteren <web@cedricnugteren.nl>2018-10-23 20:52:25 +0200
committerCedric Nugteren <web@cedricnugteren.nl>2018-10-23 20:52:25 +0200
commitd45911b61dedafcbd74f65df263b4197697d6a81 (patch)
tree1f5d30db2f1b77a8258c5d0b331581938574d17e /src/clblast_cuda.cpp
parent44b630fc222c6e22446c20995411994b51bc2f21 (diff)
Added groundwork for col2im algorithm plus first non-working version of kernel and test
Diffstat (limited to 'src/clblast_cuda.cpp')
-rw-r--r--src/clblast_cuda.cpp38
1 files changed, 38 insertions, 0 deletions
diff --git a/src/clblast_cuda.cpp b/src/clblast_cuda.cpp
index f14806cb..03d995ba 100644
--- a/src/clblast_cuda.cpp
+++ b/src/clblast_cuda.cpp
@@ -2350,6 +2350,44 @@ template StatusCode PUBLIC_API Im2col<half>(const size_t, const size_t, const si
CUdeviceptr, const size_t,
const CUcontext, const CUdevice);
+// Col2im function (non-BLAS function): SCOL2IM/DCOL2IM/CCOL2IM/ZCOL2IM/HCOL2IM
+template <typename T>
+StatusCode Col2im(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w,
+ const CUdeviceptr col_buffer, const size_t col_offset,
+ CUdeviceptr im_buffer, const size_t im_offset,
+ const CUcontext context, const CUdevice device) {
+ try {
+ const auto context_cpp = Context(context);
+ const auto device_cpp = Device(device);
+ auto queue_cpp = Queue(context_cpp, device_cpp);
+ auto routine = Xcol2im<T>(queue_cpp, nullptr);
+ routine.DoCol2im(channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+ Buffer<T>(col_buffer), col_offset,
+ Buffer<T>(im_buffer), im_offset);
+ return StatusCode::kSuccess;
+ } catch (...) { return DispatchException(); }
+}
+template StatusCode PUBLIC_API Col2im<float>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Col2im<double>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Col2im<float2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Col2im<double2>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ const CUcontext, const CUdevice);
+template StatusCode PUBLIC_API Col2im<half>(const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t, const size_t,
+ const CUdeviceptr, const size_t,
+ CUdeviceptr, const size_t,
+ const CUcontext, const CUdevice);
+
// Batched convolution as GEMM (non-BLAS function): SCONVGEMM/DCONVGEMM/HCONVGEMM
template <typename T>
StatusCode Convgemm(const size_t channels, const size_t height, const size_t width, const size_t kernel_h, const size_t kernel_w, const size_t pad_h, const size_t pad_w, const size_t stride_h, const size_t stride_w, const size_t dilation_h, const size_t dilation_w, const size_t num_kernels, const size_t batch_count,