Changed col2im to accumulate into (add onto) the existing im-buffer

author: Cedric Nugteren <web@cedricnugteren.nl> 2018-11-07 19:45:07 +0100
committer: Cedric Nugteren <web@cedricnugteren.nl> 2018-11-07 19:45:07 +0100
commit: 6f67525ea693d0761c479b060c04ce93d408beb5 (patch)
tree: 8e76bcfa9bf9d067a3969c70023634c66a9657f4
parent: 2d32a23293cdc5e0e34160e9ee4f15dc824592b3 (diff)
4 files changed, 7 insertions, 14 deletions
diff --git a/doc/api.md b/doc/api.md
index 64b4a1c8..337b5af9 100644
--- a/doc/api.md
+++ b/doc/api.md
@@ -3015,7 +3015,7 @@ Requirements for OMATCOPY:
 xIM2COL: Im2col function (non-BLAS function)
 -------------
 
-Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.
+Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix. Overwrites any existing values in the _col_ buffer.
 
 C++ API:
 ```
@@ -3075,7 +3075,7 @@ Arguments to IM2COL:
 xCOL2IM: Col2im function (non-BLAS function)
 -------------
 
-Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.
+Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix. Accumulates results on top of the existing values in the _im_ buffer.
 
 C++ API:
 ```
diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py
index 27107739..f8022d81 100755
--- a/scripts/generator/generator.py
+++ b/scripts/generator/generator.py
@@ -180,8 +180,8 @@ ROUTINES = [
   # Special routines:
   Routine(True,  True,  0, False, "x", "had",      T, [S,D,C,Z,H],   ["n"],                [],                                                    ["x","y"],  ["z"],                        [xn,yn,zn],      ["alpha","beta"], "",    "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []),
   Routine(True,  True,  0, False, "x", "omatcopy", T, [S,D,C,Z,H],   ["m","n"],            ["layout","a_transpose"],                              ["a"],      ["b"],                        [amn,bnma],      ["alpha"],        "",    "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]),
-  Routine(True,  True,  0, False, "x", "im2col",   T, [S,D,C,Z,H],   im2col_constants,     [],                                                    ["im"],     ["col"],                      [im,col],        [""],             "",    "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []),
-  Routine(True,  True,  0, False, "x", "col2im",   T, [S,D,C,Z,H],   im2col_constants,     [],                                                    ["col"],    ["im"],                       [col,im],        [""],             "",    "Col2im function (non-BLAS function)", "Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.", []),
+  Routine(True,  True,  0, False, "x", "im2col",   T, [S,D,C,Z,H],   im2col_constants,     [],                                                    ["im"],     ["col"],                      [im,col],        [""],             "",    "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix. Overwrites any existing values in the _col_ buffer", []),
+  Routine(True,  True,  0, False, "x", "col2im",   T, [S,D,C,Z,H],   im2col_constants,     [],                                                    ["col"],    ["im"],                       [col,im],        [""],             "",    "Col2im function (non-BLAS function)", "Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix. Accumulates results on top of the existing values in the _im_ buffer.", []),
   Routine(True,  True,  0, False, "x", "convgemm", T, [S,D,H],       convgemm_constants,   [],                                                    ["im","kernel"], ["result"],              [imb,kernel,result],[""],          "",    "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).", []),
   # Batched routines:
   Routine(True,  True,  1, False, "x", "axpy",     T, [S,D,C,Z,H],   ["n"],                [],                                                    ["x"],      ["y"],                        [xn,yn],         ["alpha"],        "",    "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []),
diff --git a/src/kernels/levelx/col2im.opencl b/src/kernels/levelx/col2im.opencl
index 5cadeec6..a37db24f 100644
--- a/src/kernels/levelx/col2im.opencl
+++ b/src/kernels/levelx/col2im.opencl
@@ -80,9 +80,10 @@ void col2im(const int input_h, const int input_w, const int channels,
       }
     }
 
-    // Sets the resulting value
+    // Accumulates the resulting value with the existing im-buffer (+= val)
     const int input_index = w_index + input_w * (h_index + input_h * c_id);
-    im_buffer[input_index + im_offset] = val;
+    real im_buffer_value = im_buffer[input_index + im_offset];
+    Add(im_buffer[input_index + im_offset], im_buffer_value, val);
   }
 }
 
diff --git a/test/routines/levelx/xcol2im.hpp b/test/routines/levelx/xcol2im.hpp
index 69e8c6c1..176fceae 100644
--- a/test/routines/levelx/xcol2im.hpp
+++ b/test/routines/levelx/xcol2im.hpp
@@ -160,14 +160,6 @@ StatusCode RunReference(const Arguments<T> &args, BuffersHost<T> &buffers_host)
   const auto col_h = TestXcol2im<T>::ColHeight(args);
   const auto col_w = TestXcol2im<T>::ColWidth(args);
 
-  for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) {
-    for (auto h_index = size_t{0}; h_index < args.height; ++h_index) {
-      for (auto w_index = size_t{0}; w_index < args.width; ++w_index) {
-        const auto im_index = w_index + args.width * (h_index + args.height * c_id);
-        buffers_host.a_mat[im_index + args.a_offset] = 0;
-      }
-    }
-  }
   for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) { // image channels
     for (auto kh_id = size_t{0}; kh_id < args.kernel_h; ++kh_id) { // kernel height
       for (auto kw_id = size_t{0}; kw_id < args.kernel_w; ++kw_id) { // kernel width
author	Cedric Nugteren <web@cedricnugteren.nl>	2018-11-07 19:45:07 +0100
committer	Cedric Nugteren <web@cedricnugteren.nl>	2018-11-07 19:45:07 +0100
commit	6f67525ea693d0761c479b060c04ce93d408beb5 (patch)
tree	8e76bcfa9bf9d067a3969c70023634c66a9657f4
parent	2d32a23293cdc5e0e34160e9ee4f15dc824592b3 (diff)