From 6f67525ea693d0761c479b060c04ce93d408beb5 Mon Sep 17 00:00:00 2001 From: Cedric Nugteren Date: Wed, 7 Nov 2018 19:45:07 +0100 Subject: Changed col2im to append to the existing im-buffer --- doc/api.md | 4 ++-- scripts/generator/generator.py | 4 ++-- src/kernels/levelx/col2im.opencl | 5 +++-- test/routines/levelx/xcol2im.hpp | 8 -------- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/doc/api.md b/doc/api.md index 64b4a1c8..337b5af9 100644 --- a/doc/api.md +++ b/doc/api.md @@ -3015,7 +3015,7 @@ Requirements for OMATCOPY: xIM2COL: Im2col function (non-BLAS function) ------------- -Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix. +Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix. Overwrites any existing values in the _col_ buffer. C++ API: ``` @@ -3075,7 +3075,7 @@ Arguments to IM2COL: xCOL2IM: Col2im function (non-BLAS function) ------------- -Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix. +Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix. Accumulates results on top of the existing values in the _im_ buffer. 
C++ API: ``` diff --git a/scripts/generator/generator.py b/scripts/generator/generator.py index 27107739..f8022d81 100755 --- a/scripts/generator/generator.py +++ b/scripts/generator/generator.py @@ -180,8 +180,8 @@ ROUTINES = [ # Special routines: Routine(True, True, 0, False, "x", "had", T, [S,D,C,Z,H], ["n"], [], ["x","y"], ["z"], [xn,yn,zn], ["alpha","beta"], "", "Element-wise vector product (Hadamard)", "Performs the Hadamard element-wise product _z = alpha * x * y + beta * z_, in which _x_, _y_, and _z_ are vectors and _alpha_ and _beta_ are scalar constants.", []), Routine(True, True, 0, False, "x", "omatcopy", T, [S,D,C,Z,H], ["m","n"], ["layout","a_transpose"], ["a"], ["b"], [amn,bnma], ["alpha"], "", "Scaling and out-place transpose/copy (non-BLAS function)", "Performs scaling and out-of-place transposition/copying of matrices according to _B = alpha*op(A)_, in which _A_ is an input matrix (_m_ rows by _n_ columns), _B_ an output matrix, and _alpha_ a scalar value. The operation _op_ can be a normal matrix copy, a transposition or a conjugate transposition.", [ald_m, bld_n]), - Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, [], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix.", []), - Routine(True, True, 0, False, "x", "col2im", T, [S,D,C,Z,H], im2col_constants, [], ["col"], ["im"], [col,im], [""], "", "Col2im function (non-BLAS function)", "Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix.", []), + Routine(True, True, 0, False, "x", "im2col", T, [S,D,C,Z,H], im2col_constants, [], ["im"], ["col"], [im,col], [""], "", "Im2col function (non-BLAS function)", "Performs the im2col algorithm, in which _im_ is the input matrix and _col_ is the output matrix. 
Overwrites any existing values in the _col_ buffer", []), + Routine(True, True, 0, False, "x", "col2im", T, [S,D,C,Z,H], im2col_constants, [], ["col"], ["im"], [col,im], [""], "", "Col2im function (non-BLAS function)", "Performs the col2im algorithm, in which _col_ is the input matrix and _im_ is the output matrix. Accumulates results on top of the existing values in the _im_ buffer.", []), Routine(True, True, 0, False, "x", "convgemm", T, [S,D,H], convgemm_constants, [], ["im","kernel"], ["result"], [imb,kernel,result],[""], "", "Batched convolution as GEMM (non-BLAS function)", "Integrates im2col and GEMM for batched 3D convolution, in which _im_ is the 4D input tensor (NCHW - batch-channelin-height-width), _kernel_ the 4D kernel weights tensor (KCHW - channelout-channelin-height-width), and _result_ the 4D output tensor (NCHW - batch-channelout-height-width).", []), # Batched routines: Routine(True, True, 1, False, "x", "axpy", T, [S,D,C,Z,H], ["n"], [], ["x"], ["y"], [xn,yn], ["alpha"], "", "Batched version of AXPY", "As AXPY, but multiple operations are batched together for better performance.", []), diff --git a/src/kernels/levelx/col2im.opencl b/src/kernels/levelx/col2im.opencl index 5cadeec6..a37db24f 100644 --- a/src/kernels/levelx/col2im.opencl +++ b/src/kernels/levelx/col2im.opencl @@ -80,9 +80,10 @@ void col2im(const int input_h, const int input_w, const int channels, } } - // Sets the resulting value + // Accumulates the resulting value with the existing im-buffer (+= val) const int input_index = w_index + input_w * (h_index + input_h * c_id); - im_buffer[input_index + im_offset] = val; + real im_buffer_value = im_buffer[input_index + im_offset]; + Add(im_buffer[input_index + im_offset], im_buffer_value, val); } } diff --git a/test/routines/levelx/xcol2im.hpp b/test/routines/levelx/xcol2im.hpp index 69e8c6c1..176fceae 100644 --- a/test/routines/levelx/xcol2im.hpp +++ b/test/routines/levelx/xcol2im.hpp @@ -160,14 +160,6 @@ StatusCode RunReference(const 
Arguments &args, BuffersHost &buffers_host) const auto col_h = TestXcol2im::ColHeight(args); const auto col_w = TestXcol2im::ColWidth(args); - for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) { - for (auto h_index = size_t{0}; h_index < args.height; ++h_index) { - for (auto w_index = size_t{0}; w_index < args.width; ++w_index) { - const auto im_index = w_index + args.width * (h_index + args.height * c_id); - buffers_host.a_mat[im_index + args.a_offset] = 0; - } - } - } for (auto c_id = size_t{0}; c_id < args.channels; ++c_id) { // image channels for (auto kh_id = size_t{0}; kh_id < args.kernel_h; ++kh_id) { // kernel height for (auto kw_id = size_t{0}; kw_id < args.kernel_w; ++kw_id) { // kernel width -- cgit v1.2.3