diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
deleted file mode 100644
index 4140c2143..000000000
--- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp
+++ /dev/null
@@ -1,1864 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-//
-// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
-//
-
-#include <ops/declarable/helpers/convolutions.h>
-#include<ops/declarable/helpers/addBias.h>
-#include <ops/declarable/helpers/im2col.h>
-#include <ops/declarable/helpers/col2im.h>
-#include <array/NDArrayFactory.h>
-#include <helpers/MmulHelper.h>
-#include <execution/Threads.h>
-
-namespace sd {
-    namespace ops  {
-
-
-//////////////////////////////////////////////////////////////////////////
-// vol2col: volume [bS, iC, iD, iH, iW] is unfolded into columns [bS, iC, kD, kH, kW, oD, oH, oW]
-        template <typename T>
-        static void vol2col_(const NDArray& volume, NDArray& columns, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-            // Copies every kernel-window element of `volume` into `columns`; window positions that fall outside the volume are written as zero.
-            const int bS = volume.sizeAt(0);
-            const int iC = volume.sizeAt(1);
-            const int iD = volume.sizeAt(2);
-            const int iH = volume.sizeAt(3);
-            const int iW = volume.sizeAt(4);
-            const int kD = columns.sizeAt(2);
-            const int kH = columns.sizeAt(3);
-            const int kW = columns.sizeAt(4);
-            const int oD = columns.sizeAt(5);
-            const int oH = columns.sizeAt(6);
-            const int oW = columns.sizeAt(7);
-            const Nd4jLong colStride0 = columns.stridesOf()[0];
-            const Nd4jLong colStride1 = columns.stridesOf()[1];
-            const Nd4jLong colStride2 = columns.stridesOf()[2];
-            const Nd4jLong colStride3 = columns.stridesOf()[3];
-            const Nd4jLong colStride4 = columns.stridesOf()[4];
-            const Nd4jLong colStride5 = columns.stridesOf()[5];
-            const Nd4jLong colStride6 = columns.stridesOf()[6];
-            const Nd4jLong colStride7 = columns.stridesOf()[7];
-            const Nd4jLong volStride0 = volume.stridesOf()[0];
-            const Nd4jLong volStride1 = volume.stridesOf()[1];
-            const Nd4jLong volStride2 = volume.stridesOf()[2];
-            const Nd4jLong volStride3 = volume.stridesOf()[3];
-            const Nd4jLong volStride4 = volume.stridesOf()[4];
-
-            T* colBuff = columns.bufferAsT<T>();
-            T* volBuff = const_cast<NDArray&>(volume).bufferAsT<T>();
-
-            // first branch: both arrays report 'c' ordering and pass shape::strideDescendingCAscendingF; work is split over (b, c, kDep)
-            if (volume.ordering() == 'c' &&  columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) {
-
-                auto func = PRAGMA_THREADS_FOR_3D {
-                    T *col, *vol;
-                    int volDep, volRow, volCol;
-
-                    for (int b = start_x; b < stop_x; b += inc_x) {
-                        for (int c = start_y; c < stop_y; c += inc_y) {
-                            for (int kDep = start_z; kDep < stop_z; kDep += inc_z) {
-                                for (int kRow = 0; kRow < kH; ++kRow) {
-                                    for (int kCol = 0; kCol < kW; ++kCol) {
-                                        for (int colD = 0; colD < oD; ++colD) {
-                                            for (int colH = 0; colH < oH; ++colH) {
-                                                for (int colW = 0; colW < oW; ++colW) {
-                                                    // volume coordinates read for this kernel offset / output position; may be negative or past the end because of padding
-                                                    volDep = (-pD + kDep * dD) + colD * sD;
-                                                    volRow = (-pH + kRow * dH) + colH * sH;
-                                                    volCol = (-pW + kCol * dW) + colW * sW;
-
-                                                    col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7;
-                                                    // casting to unsigned turns a negative coordinate into a huge value, so one comparison covers both `< 0` and `>= size`
-                                                    if (static_cast<unsigned>(volDep) >= static_cast<unsigned>(iD) || static_cast<unsigned>(volRow) >= static_cast<unsigned>(iH) || static_cast<unsigned>(volCol) >= static_cast<unsigned>(iW))
-                                                        *col = static_cast<T>(0.);
-                                                    else {
-                                                        vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4;
-                                                        *col = *vol;
-                                                    }
-                                                }
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, kD, 1);
-
-            } else {
-                // fallback branch: same element copy with the loops reordered; work is split over (b, colD)
-                auto func = PRAGMA_THREADS_FOR_2D {
-                    T *col, *vol;
-                    int volDep, volRow, volCol;
-                    for (int b = start_x; b < stop_x; b++) {
-                        for (int colD = start_y; colD < stop_y; colD++) {
-                            for (int colH = 0; colH < oH; ++colH) {
-                                for (int colW = 0; colW < oW; ++colW) {
-                                    for (int c = 0; c < iC; ++c) {
-                                        for (int kDep = 0; kDep < kD; ++kDep) {
-                                            for (int kRow = 0; kRow < kH; ++kRow) {
-                                                for (int kCol = 0; kCol < kW; ++kCol) {
-
-                                                    volDep = (-pD + kDep * dD) + colD * sD;
-                                                    volRow = (-pH + kRow * dH) + colH * sH;
-                                                    volCol = (-pW + kCol * dW) + colW * sW;
-
-                                                    col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7;
-                                                    // same single-comparison range test as in the first branch
-                                                    if (static_cast<unsigned>(volDep) >= static_cast<unsigned>(iD) || static_cast<unsigned>(volRow) >= static_cast<unsigned>(iH) || static_cast<unsigned>(volCol) >= static_cast<unsigned>(iW))
-                                                        *col = static_cast<T>(0.f);
-                                                    else {
-                                                        vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4;
-                                                        *col = *vol;
-                                                    }
-                                                }
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, oD, 1);
-                //func(0, 0, bS, 1, 0, oD, 1);
-            }
-        }
-
-//////////////////////////////////////////////////////////////////////////
-// col2vol: columns [bS, iC, kD, kH, kW, oD, oH, oW] are accumulated back into volume [bS, iC, iD, iH, iW]
-        template <typename T>
-        static void col2vol_(const NDArray& columns, NDArray& volume, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-            // Adds every `columns` element onto the `volume` cell it maps to; elements that map outside the volume are skipped.
-            // initial zeroing of volume content
-            volume.nullify();
-
-            const int bS = volume.sizeAt(0);
-            const int iC = volume.sizeAt(1);
-            const int iD = volume.sizeAt(2);
-            const int iH = volume.sizeAt(3);
-            const int iW = volume.sizeAt(4);
-            const int kD = columns.sizeAt(2);
-            const int kH = columns.sizeAt(3);
-            const int kW = columns.sizeAt(4);
-            const int oD = columns.sizeAt(5);
-            const int oH = columns.sizeAt(6);
-            const int oW = columns.sizeAt(7);
-            const Nd4jLong colStride0 = columns.stridesOf()[0];
-            const Nd4jLong colStride1 = columns.stridesOf()[1];
-            const Nd4jLong colStride2 = columns.stridesOf()[2];
-            const Nd4jLong colStride3 = columns.stridesOf()[3];
-            const Nd4jLong colStride4 = columns.stridesOf()[4];
-            const Nd4jLong colStride5 = columns.stridesOf()[5];
-            const Nd4jLong colStride6 = columns.stridesOf()[6];
-            const Nd4jLong colStride7 = columns.stridesOf()[7];
-            const Nd4jLong volStride0 = volume.stridesOf()[0];
-            const Nd4jLong volStride1 = volume.stridesOf()[1];
-            const Nd4jLong volStride2 = volume.stridesOf()[2];
-            const Nd4jLong volStride3 = volume.stridesOf()[3];
-            const Nd4jLong volStride4 = volume.stridesOf()[4];
-
-            T* volBuff = volume.bufferAsT<T>();
-            T* colBuff = const_cast<NDArray&>(columns).bufferAsT<T>();
-
-            // both branches split work over the batch index b only; presumably because several (k*, col*) tuples can hit the same volume cell with `+=` - confirm
-            if (volume.ordering() == 'c' &&  columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) {
-
-                auto func = PRAGMA_THREADS_FOR {
-                    T* col, *vol;
-                    int volDep, volRow, volCol;
-
-                    for (int b = start; b < stop; b++) {
-                        for (int c = 0; c < iC; c++) {
-                            for (int kDep = 0; kDep < kD; ++kDep) {
-                                for (int kRow = 0; kRow < kH; ++kRow) {
-                                    for (int kCol = 0; kCol < kW; ++kCol) {
-                                        for (int colD = 0; colD < oD; ++colD) {
-                                            for (int colH = 0; colH < oH; ++colH) {
-                                                for (int colW = 0; colW < oW; ++colW) {
-                                                    // volume coordinates targeted by this column element; may be out of range because of padding
-                                                    volDep = -pD + kDep * dD + colD * sD;
-                                                    volRow = -pH + kRow * dH + colH * sH;
-                                                    volCol = -pW + kCol * dW + colW * sW;
-                                                    // unsigned casts make a negative coordinate fail the `<` test, so only in-range cells are accumulated
-                                                    if (static_cast<unsigned>(volDep) < static_cast<unsigned>(iD) && static_cast<unsigned>(volRow) < static_cast<unsigned>(iH) && static_cast<unsigned>(volCol) < static_cast<unsigned>(iW)) {
-                                                        col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7;
-                                                        vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4;
-                                                        *vol += *col;
-                                                    }
-                                                }
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_tad(func, 0, bS);
-
-            } else {
-                // fallback branch: same accumulation with the inner loops reordered
-                auto func = PRAGMA_THREADS_FOR {
-                    T* col, *vol;
-                    int volDep, volRow, volCol;
-
-                    for (int b = start; b < stop; b++) {
-                        for (int colD = 0; colD < oD; colD++) {
-                            for (int colH = 0; colH < oH; ++colH) {
-                                for (int colW = 0; colW < oW; ++colW) {
-                                    for (int c = 0; c < iC; ++c) {
-                                        for (int kDep = 0; kDep < kD; ++kDep) {
-                                            for (int kRow = 0; kRow < kH; ++kRow) {
-                                                for (int kCol = 0; kCol < kW; ++kCol) {
-
-                                                    volDep = (-pD + kDep * dD) + colD * sD;
-                                                    volRow = (-pH + kRow * dH) + colH * sH;
-                                                    volCol = (-pW + kCol * dW) + colW * sW;
-
-                                                    if (static_cast<unsigned>(volDep) < static_cast<unsigned>(iD) && static_cast<unsigned>(volRow) < static_cast<unsigned>(iH) && static_cast<unsigned>(volCol) < static_cast<unsigned>(iW)) {
-                                                        col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7;
-                                                        vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4;
-                                                        *vol += *col;
-                                                    }
-                                                }
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_tad(func, 0, bS);
-            }
-        }
-
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename X, typename Y>
-        static void conv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            // 2D convolution via im2col followed by tensorDot; the product is assigned to `output` and `bias` is added when it is non-null.
-            // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-            // weights [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
-            // bias    [oC]
-            // output  [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
-
-            // kH  filter(kernel) height
-            // kW  filter(kernel) width
-            // sH  strides height
-            // sW  strides width
-            // pH  paddings height
-            // pW  paddings width
-            // dH  dilations height
-            // dW  dilations width
-            // paddingMode 0-VALID, 1-SAME
-            // isNCHW      1-NCHW,  0-NHWC
-            // NOTE: template parameters X and Y are not referenced anywhere in this body
-            int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
-            int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
-            // pH/pW are by-value parameters here; presumably calcPadding2D adjusts them for the given paddingMode - confirm in ConvolutionUtils
-            ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
-
-            nd4j_debug("MKL-DNN is not used for conv2d!\n", 0);
-
-            std::vector<int> permutForOutput;
-
-            if(isNCHW)
-                permutForOutput = {0, 3, 1, 2};                                             // [bS, oH, oW, oC] -> [bS, oC, oH, oW]
-            else
-                input = new NDArray(input->permute({0, 3, 1, 2}));                         // [bS, iH, iW, iC] -> [bS, iC, iH, iW] if NHWC
-            // in the NHWC case `input` now points to a heap-allocated NDArray that is deleted at the end of this function
-            std::vector<int> wAxes;
-            if(0 == wFormat)
-                wAxes = {0, 1, 2};
-            else if(1 == wFormat)
-                wAxes = {2, 3, 1};
-            else
-                wAxes = {1, 2, 3};
-            // NOTE(review): colP is a permuted NDArray made from col; im2col fills colP and tensorDot reads col, so the two presumably share one buffer - confirm
-            NDArray col('c', {bS, oH, oW, kH, kW, iC}, input->dataType(), input->getContext());
-            NDArray colP = col.permute({0, 5, 3, 4, 1, 2});            // {bS, iC, kH, kW, oH, oW}
-            NDArray mmulResult('f', {bS*oH*oW, oC}, output->dataType(), output->getContext());
-
-            //----- calculation of output -----//
-            auto ctx = block.launchContext();
-            helpers::im2col(*ctx, *input, colP, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
-            MmulHelper::tensorDot(&col, weights, &mmulResult, {3,4,5}, wAxes, {}); // [bS, oH, oW, kH, kW, iC] x [kH, kW, iC, oC] = [bS, oH, oW, oC]
-
-            //----- assign mmulResult to output  -----//
-            if(isNCHW) {
-                mmulResult.reshapei({bS, oH, oW, oC});
-                mmulResult.permutei(permutForOutput);
-            }
-            output->assign(mmulResult);
-
-            //----- add biases if required -----//
-            if(bias)
-                // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
-                helpers::addBias(block, *output, *bias, *output, isNCHW);
-
-            if(!isNCHW)
-                delete input;
-
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename X, typename Y>
-        static void conv2dBP_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            // Backward pass of conv2d: fills gradW and gradB when they are non-null and always fills gradI.
-            // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-            // weights [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
-            // bias    [oC]
-            // gradO   [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-            // NOTE: `bias` and the template parameters X, Y are not referenced anywhere in this body
-            // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
-            // gradW    [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
-            // gradB    [oC]
-
-            // kH         filter(kernel) height
-            // kW         filter(kernel) width
-            // sH         strides height
-            // sW         strides width
-            // pH         paddings height
-            // pW         paddings width
-            // dH         dilations height
-            // dW         dilations width
-            // paddingMode 0-VALID, 1-SAME
-            // isNCHW      0-NHWC, 1-NCHW
-
-            int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
-            int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
-
-            ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
-
-            nd4j_debug("MKL-DNN is not used for conv2d_bp!\n", 0);
-
-            std::vector<int> gradOaxesForDot;
-
-            if(!isNCHW) {
-                gradOaxesForDot  = {0, 1, 2};                                           // bS, oH, oW
-                input = new NDArray(input->permute({0, 3, 1, 2}));                      // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
-                gradI = new NDArray(gradI->permute({0, 3, 1, 2}));                      // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
-            } else {
-                gradOaxesForDot  = {0, 2, 3};                                           // bS, oH, oW
-            }
-            // NHWC case: input and gradI now point to heap-allocated permuted NDArrays deleted at the end; presumably gradI's copy shares the caller's buffer - confirm
-            std::vector<int> wPermut, colPermut;
-
-            if(0 == wFormat) {
-                wPermut   = {2, 0, 1, 3};
-                colPermut = {2, 3, 1, 0, 4, 5};
-            }
-            else if(1 == wFormat) {
-                wPermut   = {1, 2, 3, 0};
-                colPermut = {1, 2, 3, 0, 4, 5};
-            }
-            else {
-                wPermut   = {3, 1, 2, 0};
-                colPermut = {2, 3, 1, 0, 4, 5};
-            }
-
-            NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
-
-            // ----- calculation of gradW ----- //
-            if(gradW) {
-                auto ctx = block.launchContext();
-                helpers::im2col(*ctx, *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));   // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
-                sd::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, gradOaxesForDot, wPermut);       // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC]
-            }
-
-            // ----- calculation of gradB ----- //
-            if(gradB) {
-                NDArray* gradBR = gradB;
-                if(gradB->rankOf() == 2)
-                    gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
-                gradO->reduceAlongDimension(reduce::Sum, *gradBR, gradOaxesForDot);                          // sum over bS, oH, oW
-                if(gradBR != gradB)
-                    delete gradBR;
-            }
-
-            //----- calculation of gradI -----//
-            // [kH, kW, iC, oC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]
-            // [oC, iC, kH, kW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, bS, oH, oW]
-            // [oC, kH, kW, iC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]
-            sd::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, colPermut);
-            // `columns` is reused as the tensorDot output here, so any im2col content from the gradW step is overwritten before col2im reads it
-            helpers::col2im(*block.launchContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW);       // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
-
-            if(!isNCHW) {
-                delete input;
-                delete gradI;
-            }
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename X, typename Y>
-        static void depthwiseConv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            // Depthwise 2D convolution via im2col followed by tensorDot written into a reshaped `output`; `bias` is added when it is non-null.
-            // input     [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-            // weights   [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
-            // bias      [oC] = iC*mC
-            // output    [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)
-
-            // kH           filter(kernel) height
-            // kW           filter(kernel) width
-            // sH           strides height
-            // sW           strides width
-            // pH           paddings height
-            // pW           paddings width
-            // dH           dilations height
-            // dW           dilations width
-            // paddingMode  0-VALID, 1-SAME
-            // isNCHW       0-NHWC,  1-NCHW
-            // NOTE: template parameters X and Y are not referenced anywhere in this body
-            int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
-            int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
-            mC = weights->sizeAt(indWmC);                           // channels multiplier
-
-            std::vector<std::vector<Nd4jLong>> modifColumns = {{1,0,4,5,2,3}, {iC,bS*oH*oW,kH*kW}};  // [bS,iC,kH,kW,oH,oW] -> [iC,bS,oH,oW,kH,kW] -> [iC,bS*oH*oW,kH*kW]
-            std::vector<std::vector<Nd4jLong>> modifOutput, modifWeights;
-            std::vector<Nd4jLong> outReShape;
-
-            if(!isNCHW) {
-                outReShape = {bS, oH, oW, iC, mC};                                              // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
-                modifOutput = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
-                input = new NDArray(input->permute({0, 3, 1, 2}));                              // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
-            }
-            else {
-                outReShape = {bS, iC, mC, oH, oW};                                              // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
-                modifOutput = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
-            }
-            // in the NHWC case `input` now points to a heap-allocated NDArray that is deleted at the end of this function
-            if(0 == wFormat)
-                modifWeights = {{2,0,1,3},{iC,kH*kW,mC}};
-            else if(1 == wFormat)
-                modifWeights = {{1,2,3,0},{iC,kH*kW,mC}};
-            else
-                modifWeights = {{3,1,2,0},{iC,kH*kW,mC}};
-            // unlike conv2d_, calcPadding2D is called only for SAME mode and paddingMode itself is not forwarded
-            if(paddingMode == 1)                       // SAME
-                ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);
-
-            NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
-            NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false);
-            // NOTE(review): im2col receives output's context here, whereas conv2d_ passes block.launchContext() - confirm this is intentional
-            helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
-            MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, modifWeights, modifOutput);              // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC]
-
-            if(bias)
-                // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
-                helpers::addBias(block, *output, *bias, *output, isNCHW);
-
-            if(!isNCHW)
-                delete input;
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename X, typename Y>
-        static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            // Backward pass of depthwise conv2d: fills gradW and gradI, and gradB when it is non-null; `bias`, X and Y are not referenced in this body.
-            // input    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-            // weights  [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
-            // bias     [oC] = [iC*mC]
-            // gradO    [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-            // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
-            // gradW    [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
-            // gradB    [oC]
-
-            //  kH          filter(kernel) height
-            //  kW          filter(kernel) width
-            //  sH          strides height
-            //  sW          strides width
-            //  pH          paddings height
-            //  pW          paddings width
-            //  dH          dilations height
-            //  dW          dilations width
-            //  paddingMode 0-VALID, 1-SAME
-            //  isNCHW      0-NHWC, 1-NCHW
-
-            int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
-            int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
-            mC = weights->sizeAt(indWmC);                           // channels multiplier
-
-            std::vector<std::vector<Nd4jLong>> modifColumns = {{1,2,3,0,4,5}, {iC, kH*kW, bS*oH*oW}};      // [bS,iC,kH,kW,oH,oW] -> [iC, kH*kW, bS*oH*oW]
-            std::vector<std::vector<Nd4jLong>> modifGradO1, modifGradO2, modifWeights;
-            std::vector<Nd4jLong> gradOreShape;
-
-            if(!isNCHW) {
-                gradOreShape = {bS, oH, oW, iC, mC};                                            // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
-                modifGradO1 = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
-                modifGradO2 = {{3,0,1,2},{iC, mC, bS*oH*oW}};                                   // [bS,oH,oW,iC*mC] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
-                input = new NDArray(input->permute({0, 3, 1, 2}));                              // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
-                gradI = new NDArray(gradI->permute({0, 3, 1, 2}));                              // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
-            }
-            else {
-                gradOreShape = {bS, iC, mC, oH, oW};                                            // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
-                modifGradO1 = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
-                modifGradO2 = {{1,0,2,3},{iC, mC, bS*oH*oW}};                                   // [bS,iC*mC,oH,oW] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
-            }
-            // NHWC case: input and gradI now point to heap-allocated permuted NDArrays that are deleted at the end of this function
-            if(0 == wFormat)
-                modifWeights = {{2,0,1,3},{iC,kH*kW,mC}};
-            else if(1 == wFormat)
-                modifWeights = {{1,2,3,0},{iC,kH*kW,mC}};
-            else
-                modifWeights = {{3,1,2,0},{iC,kH*kW,mC}};
-
-            if(paddingMode == 1)                       // SAME
-                ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);
-
-            NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
-            NDArray gradOreshaped = gradO->reshape(gradO->ordering(), gradOreShape);
-
-            // ----- calculation of gradW ----- //
-            // NOTE(review): gradW is handed to tensorDot without a null check, while gradB is guarded below - confirm callers always supply gradW
-            helpers::im2col(*input->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
-            sd::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, modifWeights);  // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC]
-
-            // ----- calculation of gradB ----- //
-            if(gradB) {
-                NDArray* gradBR = gradB;
-                if(gradB->rankOf() == 2)
-                    gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}, false));
-                gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1});                      // sum over bS, oH, oW
-
-                if(gradBR != gradB)
-                    delete gradBR;
-            }
-            // `columns` is reused below as the tensorDot output, so its im2col content from the gradW step is overwritten before col2im reads it
-            //----- calculation of gradI -----//
-            sd::MmulHelper::tensorDot(weights, gradO, &columns, modifWeights, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW]
-            helpers::col2im(*input->getContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW);       // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
-
-            if(!isNCHW) {
-                delete input;
-                delete gradI;
-            }
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename X, typename Y>
-        static void sconv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-
-            // Separable 2D convolution: a depthwise convolution, optionally followed by a 1x1 (pointwise) convolution.
-            //
-            // input         [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
-            // weightsDepth  [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
-            // weightsPoint  [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]; may be nullptr
-            // bias          [oC], oC = iC*mC if weightsPoint=nullptr
-            // output is     [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW]  (NCHW)
-
-            //  kH, kW       filter(kernel) height / width
-            //  sH, sW       strides height / width
-            //  pH, pW       paddings height / width
-            //  dH, dW       dilations height / width
-            //  paddingMode  0-VALID, 1-SAME
-            //  isNCHW       1-NCHW,  0-NHWC
-
-            int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier, output channels, output height/width
-            int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
-            mC = weightsDepth->sizeAt(indWmC);                      // channels multiplier
-
-            // ----- depthwise only: the result (with bias) is written straight into output, oC = iC*mC ----- //
-            if(!weightsPoint) {
-                ConvolutionUtils::depthwiseConv2d(block, input, weightsDepth, bias, output, kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, isNCHW, wFormat);
-                return;
-            }
-
-            // ----- depthwise followed by pointwise ----- //
-            // The intermediate depthwise result lives on the stack, so it is released on every exit path,
-            // including when one of the convolutions below throws (the previous new/delete pair leaked in that case).
-            NDArray outputDepth(output->ordering(), !isNCHW ? std::vector<Nd4jLong>({bS, oH, oW, iC*mC}) : std::vector<Nd4jLong>({bS, iC*mC, oH, oW}), input->dataType(), input->getContext());
-
-            // bias is not applied at the depthwise stage, it is added by the pointwise convolution
-            ConvolutionUtils::depthwiseConv2d(block, input, weightsDepth, nullptr, &outputDepth, kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, isNCHW, wFormat);
-
-            // 1x1 kernel, unit strides, zero paddings, unit dilations: in this case oH=iH, oW=iW
-            ConvolutionUtils::conv2d(block, &outputDepth, weightsPoint, bias, output, 1,1, 1,1, 0,0, 1,1, paddingMode, isNCHW, wFormat);
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename T>
-        static void upsampling2d_(const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) {
-            // Every output element (h, w) is a copy of the input element (h / factorH, w / factorW).
-            // input  : [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
-            // output : [bS, iC, factorH*iH, factorW*iW] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
-
-            const T* src = input.bufferAsT<T>();
-            T*       dst = output.bufferAsT<T>();
-
-            // positions of the channel / height / width axes for the selected layout
-            const uint axisC = isNCHW ? 1 : 3;
-            const uint axisH = isNCHW ? 2 : 1;
-            const uint axisW = axisH + 1;
-
-            const uint bS   = input.sizeAt(0);
-            const uint iC   = input.sizeAt(axisC);
-            const uint outH = output.sizeAt(axisH);
-            const uint outW = output.sizeAt(axisW);
-
-            const Nd4jLong srcStrideB = input.stridesOf()[0];
-            const Nd4jLong srcStrideC = input.stridesOf()[axisC];
-            const Nd4jLong srcStrideH = input.stridesOf()[axisH];
-            const Nd4jLong srcStrideW = input.stridesOf()[axisW];
-
-            const Nd4jLong dstStrideB = output.stridesOf()[0];
-            const Nd4jLong dstStrideC = output.stridesOf()[axisC];
-            const Nd4jLong dstStrideH = output.stridesOf()[axisH];
-            const Nd4jLong dstStrideW = output.stridesOf()[axisW];
-
-            // walk the output array; batch, channel and output height are the parallelised dimensions
-            auto func = PRAGMA_THREADS_FOR_3D {
-                for (uint b = start_x; b < stop_x; b += inc_x) {
-                    for (uint c = start_y; c < stop_y; c += inc_y) {
-                        for (uint h = start_z; h < stop_z; h += inc_z) {
-
-                            // the source row is the same for the whole output row
-                            const uint srcH = h / factorH;
-                            const Nd4jLong srcRow = b * srcStrideB + c * srcStrideC + srcH * srcStrideH;
-                            const Nd4jLong dstRow = b * dstStrideB + c * dstStrideC + h * dstStrideH;
-
-                            for (uint w = 0; w < outW; ++w) {
-                                const uint srcW = w / factorW;
-                                dst[dstRow + w * dstStrideW] = src[srcRow + srcW * srcStrideW];
-                            }
-                        }
-                    }
-                }
-            };
-
-            samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, outH, 1);
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename T>
-        static void upsampling3d_(const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
-            // Every output element (d, h, w) is a copy of the input element (d / factorD, h / factorH, w / factorW).
-            // input  : [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
-            // output : [bS, iC, factorD*iD, factorH*iH, factorW*iW] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
-
-            const T* src = input.bufferAsT<T>();
-            T*       dst = output.bufferAsT<T>();
-
-            // positions of the channel / depth / height / width axes for the selected layout
-            const uint axisC = isNCDHW ? 1 : 4;
-            const uint axisD = isNCDHW ? 2 : 1;
-            const uint axisH = axisD + 1;
-            const uint axisW = axisD + 2;
-
-            const uint bS   = input.sizeAt(0);
-            const uint iC   = input.sizeAt(axisC);
-            const uint outD = output.sizeAt(axisD);
-            const uint outH = output.sizeAt(axisH);
-            const uint outW = output.sizeAt(axisW);
-
-            const Nd4jLong srcStrideB = input.stridesOf()[0];
-            const Nd4jLong srcStrideC = input.stridesOf()[axisC];
-            const Nd4jLong srcStrideD = input.stridesOf()[axisD];
-            const Nd4jLong srcStrideH = input.stridesOf()[axisH];
-            const Nd4jLong srcStrideW = input.stridesOf()[axisW];
-
-            const Nd4jLong dstStrideB = output.stridesOf()[0];
-            const Nd4jLong dstStrideC = output.stridesOf()[axisC];
-            const Nd4jLong dstStrideD = output.stridesOf()[axisD];
-            const Nd4jLong dstStrideH = output.stridesOf()[axisH];
-            const Nd4jLong dstStrideW = output.stridesOf()[axisW];
-
-            // walk the output array; batch, channel and output depth are the parallelised dimensions
-            auto func = PRAGMA_THREADS_FOR_3D {
-                for (uint b = start_x; b < stop_x; b += inc_x) {
-                    for (uint c = start_y; c < stop_y; c += inc_y) {
-                        for (uint d = start_z; d < stop_z; d += inc_z) {
-
-                            // the source slice is the same for every (h, w) of this output slice
-                            const uint srcD = d / factorD;
-                            const Nd4jLong srcSlice = b * srcStrideB + c * srcStrideC + srcD * srcStrideD;
-                            const Nd4jLong dstSlice = b * dstStrideB + c * dstStrideC + d * dstStrideD;
-
-                            for (uint h = 0; h < outH; ++h) {
-
-                                const uint srcH = h / factorH;
-                                const Nd4jLong srcRow = srcSlice + srcH * srcStrideH;
-                                const Nd4jLong dstRow = dstSlice + h * dstStrideH;
-
-                                for (uint w = 0; w < outW; ++w) {
-                                    const uint srcW = w / factorW;
-                                    dst[dstRow + w * dstStrideW] = src[srcRow + srcW * srcStrideW];
-                                }
-                            }
-                        }
-                    }
-                }
-            };
-
-            samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, outD, 1);
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename T>
-        static void upsampling2dBP_(const NDArray& gradO, NDArray& gradI, const bool isNCHW) {
-            // Backprop of 2D upsampling: each gradI element is the sum of the factorH x factorW patch of gradO it was copied into.
-            // gradO has shape [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
-            // gradI has shape [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
-
-            const T* x = gradO.bufferAsT<T>();
-                  T* z = gradI.bufferAsT<T>();
-
-            const uint dimIH = isNCHW ? 2 : 1;
-            const uint dimIC = isNCHW ? 1 : 3;
-
-            const uint bS = gradI.sizeAt(0);
-            const uint iC = gradI.sizeAt(dimIC);
-            const uint iH = gradI.sizeAt(dimIH);
-            const uint iW = gradI.sizeAt(dimIH + 1);
-
-            // nothing to fill, and the factor computation below would divide by zero
-            if (iH == 0 || iW == 0)
-                return;
-
-            // the upsampling factors are recovered from the ratio of the spatial sizes
-            const uint factorH = gradO.sizeAt(dimIH)     / iH;
-            const uint factorW = gradO.sizeAt(dimIH + 1) / iW;
-
-            const Nd4jLong xStride0 = gradO.stridesOf()[0];
-            const Nd4jLong xStride1 = gradO.stridesOf()[dimIC];
-            const Nd4jLong xStride2 = gradO.stridesOf()[dimIH];
-            const Nd4jLong xStride3 = gradO.stridesOf()[dimIH + 1];
-
-            const Nd4jLong zStride0 = gradI.stridesOf()[0];
-            const Nd4jLong zStride1 = gradI.stridesOf()[dimIC];
-            const Nd4jLong zStride2 = gradI.stridesOf()[dimIH];
-            const Nd4jLong zStride3 = gradI.stridesOf()[dimIH + 1];
-
-            // loop through gradI; batch, channel and input height are the parallelised dimensions
-            auto func = PRAGMA_THREADS_FOR_3D {
-                for (uint b = start_x; b < stop_x; b += inc_x) {
-                    for (uint c = start_y; c < stop_y; c += inc_y) {
-                        for (uint h = start_z; h < stop_z; h += inc_z) {
-
-                            // rows of gradO that map onto row h of gradI
-                            const uint xhBegin = h * factorH;
-                            const uint xhEnd   = xhBegin + factorH;
-
-                            for (uint w = 0; w < iW; ++w) {
-
-                                // columns of gradO that map onto column w of gradI
-                                const uint xwBegin = w * factorW;
-                                const uint xwEnd   = xwBegin + factorW;
-
-                                T& acc = z[b * zStride0 + c * zStride1 + h * zStride2 + w * zStride3];
-                                acc = 0;
-
-                                for (uint xh = xhBegin; xh < xhEnd; ++xh)
-                                    for (uint xw = xwBegin; xw < xwEnd; ++xw)
-                                        acc += x[b * xStride0 + c * xStride1 + xh * xStride2 + xw * xStride3];
-                            }
-                        }
-                    }
-                }
-            };
-
-            samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iH, 1);
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename T>
-        static void upsampling3dBP_(const NDArray& gradO, NDArray& gradI, const bool isNCDHW) {
-
-            // Backprop of 3D upsampling: each gradI element is the sum of the factorD x factorH x factorW block of gradO it was copied into.
-            // gradO has shape [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
-            // gradI has shape [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
-
-            const T* x = gradO.bufferAsT<T>();
-                  T* z = gradI.bufferAsT<T>();
-
-            const uint dimID = isNCDHW ? 2 : 1;
-            const uint dimIC = isNCDHW ? 1 : 4;
-
-            const uint bS = gradI.sizeAt(0);
-            const uint iC = gradI.sizeAt(dimIC);
-            const uint iD = gradI.sizeAt(dimID);
-            const uint iH = gradI.sizeAt(dimID + 1);
-            const uint iW = gradI.sizeAt(dimID + 2);
-
-            // nothing to fill, and the factor computation below would divide by zero
-            if (iD == 0 || iH == 0 || iW == 0)
-                return;
-
-            // the upsampling factors are recovered from the ratio of the spatial sizes
-            const uint factorD = gradO.sizeAt(dimID)     / iD;
-            const uint factorH = gradO.sizeAt(dimID + 1) / iH;
-            const uint factorW = gradO.sizeAt(dimID + 2) / iW;
-
-            const Nd4jLong xStride0 = gradO.stridesOf()[0];
-            const Nd4jLong xStride1 = gradO.stridesOf()[dimIC];
-            const Nd4jLong xStride2 = gradO.stridesOf()[dimID];
-            const Nd4jLong xStride3 = gradO.stridesOf()[dimID + 1];
-            const Nd4jLong xStride4 = gradO.stridesOf()[dimID + 2];
-
-            const Nd4jLong zStride0 = gradI.stridesOf()[0];
-            const Nd4jLong zStride1 = gradI.stridesOf()[dimIC];
-            const Nd4jLong zStride2 = gradI.stridesOf()[dimID];
-            const Nd4jLong zStride3 = gradI.stridesOf()[dimID + 1];
-            const Nd4jLong zStride4 = gradI.stridesOf()[dimID + 2];
-
-            // loop through gradI; batch, channel and input depth are the parallelised dimensions
-            auto func = PRAGMA_THREADS_FOR_3D {
-                for (uint b = start_x; b < stop_x; b += inc_x) {
-                    for (uint c = start_y; c < stop_y; c += inc_y) {
-                        for (uint d = start_z; d < stop_z; d += inc_z) {
-
-                            // depth slices of gradO that map onto slice d of gradI
-                            const uint xdBegin = d * factorD;
-                            const uint xdEnd   = xdBegin + factorD;
-
-                            for (uint h = 0; h < iH; ++h) {
-
-                                const uint xhBegin = h * factorH;
-                                const uint xhEnd   = xhBegin + factorH;
-
-                                for (uint w = 0; w < iW; ++w) {
-
-                                    const uint xwBegin = w * factorW;
-                                    const uint xwEnd   = xwBegin + factorW;
-
-                                    T& acc = z[b * zStride0 + c * zStride1 + d * zStride2 + h * zStride3 + w * zStride4];
-                                    acc = 0;
-
-                                    for (uint xd = xdBegin; xd < xdEnd; ++xd)
-                                        for (uint xh = xhBegin; xh < xhEnd; ++xh)
-                                            for (uint xw = xwBegin; xw < xwEnd; ++xw)
-                                                acc += x[b * xStride0 + c * xStride1 + xd * xStride2 + xh * xStride3 + xw * xStride4];
-                                }
-                            }
-                        }
-                    }
-                }
-            };
-
-            samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iD, 1);
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename T>
-        static void pooling2d_(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-            // 2D pooling over the two trailing axes.
-            // input  : [bS, iC, iH, iW]
-            // output : [bS, iC, oH, oW]
-            // poolingMode : 0 - max, 1 - average, 2 - p-norm; any other value is reported and throws
-            // extraParam0 : average -> 0 divides by the number of in-bounds samples, 1 divides by kH*kW
-            //               p-norm  -> the exponent p
-            T* out = output.bufferAsT<T>();
-            T* in  = const_cast<NDArray&>(input).bufferAsT<T>();
-
-            // kernel extent once dilation is taken into account
-            const int kHEff = kH + (kH-1)*(dH-1);
-            const int kWEff = kW + (kW-1)*(dW-1);
-
-            const int bS = input.sizeAt(0);
-            const int iC = input.sizeAt(1);
-            const int iH = input.sizeAt(2);
-            const int iW = input.sizeAt(3);
-            const int oC = output.sizeAt(1);
-            const int oH = output.sizeAt(2);
-            const int oW = output.sizeAt(3);
-
-            nd4j_debug("MKL-DNN is not used for pooling2d!\n", 0);
-
-            const Nd4jLong iStride0 = input.stridesOf()[0];
-            const Nd4jLong iStride1 = input.stridesOf()[1];
-            const Nd4jLong iStride2 = input.stridesOf()[2];
-            const Nd4jLong iStride3 = input.stridesOf()[3];
-            const Nd4jLong oStride0 = output.stridesOf()[0];
-            const Nd4jLong oStride1 = output.stridesOf()[1];
-            const Nd4jLong oStride2 = output.stridesOf()[2];
-            const Nd4jLong oStride3 = output.stridesOf()[3];
-
-            // buffer distance between two neighbouring kernel taps along height / width
-            const Nd4jLong iStep2   = dH*iStride2;
-            const Nd4jLong iStep3   = dW*iStride3;
-            const int kProd         = kH*kW;
-
-            // Clips the dilated window [first, last) to the valid range [0, extent).
-            // Both edges move in whole multiples of the dilation, so the taps keep their original phase.
-            auto trimWindow = [](Nd4jLong& first, Nd4jLong& last, const Nd4jLong extent, const Nd4jLong dilation) {
-                if (first < 0)
-                    first += dilation * ((-first + dilation - 1) / dilation);
-                if (last > extent)
-                    last -= dilation * ((last - extent + dilation - 1) / dilation);
-            };
-
-            switch (poolingMode) {
-
-                case 0: {       // max
-                    auto func = PRAGMA_THREADS_FOR_2D {
-                        for (int b = start_x; b < stop_x; b += inc_x) {
-                            for (int c = start_y; c < stop_y; c += inc_y) {
-
-                                T* plane = in + b * iStride0 + c * iStride1;
-
-                                for (int oh = 0; oh < oH; ++oh) {
-
-                                    // vertical window, first in coordinates, then converted to buffer offsets
-                                    Nd4jLong hBeg = oh * sH - pH;
-                                    Nd4jLong hEnd = hBeg + kHEff;
-                                    trimWindow(hBeg, hEnd, iH, dH);
-                                    hBeg *= iStride2;
-                                    hEnd *= iStride2;
-
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        // horizontal window, same treatment
-                                        Nd4jLong wBeg = ow * sW - pW;
-                                        Nd4jLong wEnd = wBeg + kWEff;
-                                        trimWindow(wBeg, wEnd, iW, dW);
-                                        wBeg *= iStride3;
-                                        wEnd *= iStride3;
-
-                                        T best = -DataTypeUtils::max<T>();
-
-                                        for (Nd4jLong kh = hBeg; kh < hEnd; kh += iStep2)
-                                            for (Nd4jLong kw = wBeg; kw < wEnd; kw += iStep3) {
-                                                const T val = plane[kh + kw];
-                                                if (val > best)
-                                                    best = val;
-                                            }
-
-                                        out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = best;
-                                    }
-                                }
-                            }
-                        }
-                    };
-
-                    samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-                    break;
-                }
-
-                case 1: {       // avg
-                    auto func = PRAGMA_THREADS_FOR_2D {
-                        for (int b = start_x; b < stop_x; b += inc_x) {
-                            for (int c = start_y; c < stop_y; c += inc_y) {
-
-                                T* plane = in + b * iStride0 + c * iStride1;
-
-                                for (int oh = 0; oh < oH; ++oh) {
-
-                                    Nd4jLong hBeg = oh * sH - pH;
-                                    Nd4jLong hEnd = hBeg + kHEff;
-                                    trimWindow(hBeg, hEnd, iH, dH);
-                                    hBeg *= iStride2;
-                                    hEnd *= iStride2;
-
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        Nd4jLong wBeg = ow * sW - pW;
-                                        Nd4jLong wEnd = wBeg + kWEff;
-                                        trimWindow(wBeg, wEnd, iW, dW);
-                                        wBeg *= iStride3;
-                                        wEnd *= iStride3;
-
-                                        T sum = static_cast<T>(0.f);
-
-                                        for (Nd4jLong kh = hBeg; kh < hEnd; kh += iStep2)
-                                            for (Nd4jLong kw = wBeg; kw < wEnd; kw += iStep3)
-                                                sum += plane[kh + kw];
-
-                                        if (extraParam0 == 0) {                     //Exclude padding
-                                            // number of taps actually visited along each axis (ceil division, accounts for dilation)
-                                            int a = (hEnd - hBeg) / iStep2 + ((hEnd - hBeg) % iStep2 == 0 ? 0 : 1);
-                                            int r = (wEnd - wBeg) / iStep3 + ((wEnd - wBeg) % iStep3 == 0 ? 0 : 1);
-                                            sum /= static_cast<T>(a * r);
-                                        }
-                                        else if (extraParam0 == 1) {                //Include padding
-                                            sum /= kProd;
-                                        }
-
-                                        out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum;
-                                    }
-                                }
-                            }
-                        }
-                    };
-
-                    samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-                    break;
-                }
-
-                case 2: {       // pnorm
-                    auto func = PRAGMA_THREADS_FOR_2D {
-                        for (int b = start_x; b < stop_x; b += inc_x) {
-                            for (int c = start_y; c < stop_y; c += inc_y) {
-
-                                T* plane = in + b * iStride0 + c * iStride1;
-
-                                for (int oh = 0; oh < oH; ++oh) {
-
-                                    Nd4jLong hBeg = oh * sH - pH;
-                                    Nd4jLong hEnd = hBeg + kHEff;
-                                    trimWindow(hBeg, hEnd, iH, dH);
-                                    hBeg *= iStride2;
-                                    hEnd *= iStride2;
-
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        Nd4jLong wBeg = ow * sW - pW;
-                                        Nd4jLong wEnd = wBeg + kWEff;
-                                        trimWindow(wBeg, wEnd, iW, dW);
-                                        wBeg *= iStride3;
-                                        wEnd *= iStride3;
-
-                                        T sum = static_cast<T>(0.f);
-
-                                        // accumulate |x|^p over the window, then take the p-th root
-                                        for (Nd4jLong kh = hBeg; kh < hEnd; kh += iStep2)
-                                            for (Nd4jLong kw = wBeg; kw < wEnd; kw += iStep3)
-                                                sum += sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(plane[kh + kw]), extraParam0);
-
-                                        sum = sd::math::nd4j_pow<T, T, T>(sum, static_cast<T>((T) 1.f) / extraParam0);
-
-                                        out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum;
-                                    }
-                                }
-                            }
-                        }
-                    };
-
-                    samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-                    break;
-                }
-
-                default: {
-                    nd4j_printf("ConvolutionUtils::pooling2d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode);
-                    throw "";
-                }
-            }
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename T>
-        static void pooling3d_(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-            // input is  [bS, iC, iD, iH, iW]
-            // output is [bS, iC, oD, oH, oW]
-            T* out = output.bufferAsT<T>();
-            T* in  = const_cast<NDArray&>(input).bufferAsT<T>();
-
-            const int kDEff = kD + (kD-1)*(dD-1);
-            const int kHEff = kH + (kH-1)*(dH-1);
-            const int kWEff = kW + (kW-1)*(dW-1);
-
-            const int bS = input.sizeAt(0);
-            const int iC = input.sizeAt(1);
-            const int iD = input.sizeAt(2);
-            const int iH = input.sizeAt(3);
-            const int iW = input.sizeAt(4);
-            const int oC = output.sizeAt(1);
-            const int oD = output.sizeAt(2);
-            const int oH = output.sizeAt(3);
-            const int oW = output.sizeAt(4);
-
-            nd4j_debug("MKL-DNN is not used for pooling3d!\n", 0);
-
-            const Nd4jLong iStride0 = input.stridesOf()[0];
-            const Nd4jLong iStride1 = input.stridesOf()[1];
-            const Nd4jLong iStride2 = input.stridesOf()[2];
-            const Nd4jLong iStride3 = input.stridesOf()[3];
-            const Nd4jLong iStride4 = input.stridesOf()[4];
-            const Nd4jLong oStride0 = output.stridesOf()[0];
-            const Nd4jLong oStride1 = output.stridesOf()[1];
-            const Nd4jLong oStride2 = output.stridesOf()[2];
-            const Nd4jLong oStride3 = output.stridesOf()[3];
-            const Nd4jLong oStride4 = output.stridesOf()[4];
-            const Nd4jLong iStep2   = dD*iStride2;
-            const Nd4jLong iStep3   = dH*iStride3;
-            const Nd4jLong iStep4   = dW*iStride4;
-            const int kProd         = kD*kH*kW;
-
-            if(poolingMode == 0) {        // max
-                auto func = PRAGMA_THREADS_FOR_3D {
-                    Nd4jLong dstart, hstart, wstart, dend, hend, wend;
-                    T sum, *pIn;
-
-                    for (int b = start_x; b < stop_x; b += inc_x) {
-                        for (int c = start_y; c < stop_y; c += inc_y) {
-                            for (int od = start_z; od < stop_z; od += inc_z) {
-                                for (int oh = 0; oh < oH; ++oh) {
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        pIn = in + b * iStride0 + c * iStride1;
-
-                                        dstart = od * sD - pD;
-                                        hstart = oh * sH - pH;
-                                        wstart = ow * sW - pW;
-                                        dend = dstart + kDEff;
-                                        hend = hstart + kHEff;
-                                        wend = wstart + kWEff;
-
-                                        if (dstart < 0)
-                                            dstart += dD * ((-dstart + dD - 1) / dD);
-                                        if (hstart < 0)
-                                            hstart += dH * ((-hstart + dH - 1) / dH);
-                                        if (wstart < 0)
-                                            wstart += dW * ((-wstart + dW - 1) / dW);
-                                        if (dend > iD)
-                                            dend -= dD * ((dend - iD + dD - 1) / dD);
-                                        if (hend > iH)
-                                            hend -= dH * ((hend - iH + dH - 1) / dH);
-                                        if (wend > iW)
-                                            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-                                        dstart *= iStride2;
-                                        dend *= iStride2;
-                                        hstart *= iStride3;
-                                        hend *= iStride3;
-                                        wstart *= iStride4;
-                                        wend *= iStride4;
-
-                                        sum = -DataTypeUtils::max<T>();
-
-                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
-                                                    T val = pIn[kd + kh + kw];
-                                                    if (val > sum)
-                                                        sum = val;
-                                                }
-
-                                        out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1);
-            }
-/*************************************************************************/
-            else if(poolingMode == 1) {     // avg
-                auto func = PRAGMA_THREADS_FOR_3D {
-                    Nd4jLong dstart, hstart, wstart, dend, hend, wend;
-                    T sum, *pIn;
-
-                    for (int b = start_x; b < stop_x; b += inc_x) {
-                        for (int c = start_y; c < stop_y; c += inc_y) {
-                            for (int od = start_z; od < stop_z; od += inc_z) {
-                                for (int oh = 0; oh < oH; ++oh) {
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        pIn = in + b * iStride0 + c * iStride1;
-
-                                        dstart = od * sD - pD;
-                                        hstart = oh * sH - pH;
-                                        wstart = ow * sW - pW;
-                                        dend = dstart + kDEff;
-                                        hend = hstart + kHEff;
-                                        wend = wstart + kWEff;
-
-                                        if (dstart < 0)
-                                            dstart += dD * ((-dstart + dD - 1) / dD);
-                                        if (hstart < 0)
-                                            hstart += dH * ((-hstart + dH - 1) / dH);
-                                        if (wstart < 0)
-                                            wstart += dW * ((-wstart + dW - 1) / dW);
-                                        if (dend > iD)
-                                            dend -= dD * ((dend - iD + dD - 1) / dD);
-                                        if (hend > iH)
-                                            hend -= dH * ((hend - iH + dH - 1) / dH);
-                                        if (wend > iW)
-                                            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-                                        dstart *= iStride2;
-                                        dend *= iStride2;
-                                        hstart *= iStride3;
-                                        hend *= iStride3;
-                                        wstart *= iStride4;
-                                        wend *= iStride4;
-
-                                        sum = static_cast<T>(0.);
-
-                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                                    sum += pIn[kd + kh + kw];
-
-                                        if (extraParam0 == 0)         //Exclude padding
-                                            sum /= sd::math::nd4j_ceil<double, T>(static_cast<double>(dend - dstart) / static_cast<double>(iStep2)) * sd::math::nd4j_ceil<double, T>(static_cast<double>(hend - hstart) / static_cast<double>(iStep3)) * sd::math::nd4j_ceil<double, T>(static_cast<double>(wend - wstart) / static_cast<double>(iStep4));   //Accounts for dilation
-                                        else if (extraParam0 == 1)    //Include padding
-                                            sum /= kProd;
-
-                                        out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1);
-            }
-/*************************************************************************/
-            else if(poolingMode == 2) {  // pnorm
-                auto func = PRAGMA_THREADS_FOR_3D {
-                    Nd4jLong dstart, hstart, wstart, dend, hend, wend;
-                    T sum, *pIn;
-
-                    for (int b = start_x; b < stop_x; b += inc_x) {
-                        for (int c = start_y; c < stop_y; c += inc_y) {
-                            for (int od = start_z; od < stop_z; od += inc_z) {
-                                for (int oh = 0; oh < oH; ++oh) {
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        pIn = in + b * iStride0 + c * iStride1;
-
-                                        dstart = od * sD - pD;
-                                        hstart = oh * sH - pH;
-                                        wstart = ow * sW - pW;
-                                        dend = dstart + kDEff;
-                                        hend = hstart + kHEff;
-                                        wend = wstart + kWEff;
-
-                                        if (dstart < 0)
-                                            dstart += dD * ((-dstart + dD - 1) / dD);
-                                        if (hstart < 0)
-                                            hstart += dH * ((-hstart + dH - 1) / dH);
-                                        if (wstart < 0)
-                                            wstart += dW * ((-wstart + dW - 1) / dW);
-                                        if (dend > iD)
-                                            dend -= dD * ((dend - iD + dD - 1) / dD);
-                                        if (hend > iH)
-                                            hend -= dH * ((hend - iH + dH - 1) / dH);
-                                        if (wend > iW)
-                                            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-                                        dstart *= iStride2;
-                                        dend *= iStride2;
-                                        hstart *= iStride3;
-                                        hend *= iStride3;
-                                        wstart *= iStride4;
-                                        wend *= iStride4;
-
-                                        sum = static_cast<T>(0.);
-
-                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                                    sum += sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0);
-
-                                        sum = sd::math::nd4j_pow<T, T, T>(sum, (T) 1.f / extraParam0);
-
-                                        out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1);
-            }
-            else {
-                nd4j_printf("ConvolutionUtils::pooling3d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode);
-                throw std::runtime_error("Incorrect poooling3d mode");
-            }
-        }
-
-
-//////////////////////////////////////////////////////////////////////////
-        // Backprop of 2D pooling (CPU): accumulates into gradI the gradient with respect to the
-        // pooling input, given gradO, the gradient with respect to the pooling output.
-        //
-        // block        - graph context; not referenced by this implementation
-        // input        - [bS, iC, iH, iW], read in max and pnorm modes only
-        // gradO        - [bS, iC, oH, oW]
-        // gradI        - [bS, iC, iH, iW], output: zeroed first, then accumulated into
-        // kH, kW       - kernel size;  sH, sW - strides;  pH, pW - padding;  dH, dW - dilation
-        // poolingMode  - 0 = max, 1 = avg, 2 = pnorm; any other value throws std::runtime_error
-        // extraParam0  - avg:   0 = divide by the number of in-bounds window taps (exclude padding),
-        //                       1 = divide by kH*kW (include padding), anything else = no scaling
-        //                pnorm: the exponent p
-        //
-        // The work is handed to samediff::Threads::parallel_for over the ranges [0, bS) and [0, iC);
-        // every write of one (b, c) iteration goes to an offset based at b*gIStride0 + c*gIStride1.
-        template <typename T>
-        static void pooling2dBP_(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-            // input [bS, iC, iH, iW]
-            // gradI [bS, iC, iH, iW] -> gradI is output in this function
-            // gradO [bS, iC, oH, oW]
-
-            // initial zeroing of gradI
-            gradI.nullify();
-
-            T* in = const_cast<NDArray&>(input).bufferAsT<T>();
-            T* gO = const_cast<NDArray&>(gradO).bufferAsT<T>();
-            T* gI = gradI.bufferAsT<T>();
-
-            // effective (dilated) kernel extent
-            const int kHEff = kH + (kH-1)*(dH-1);
-            const int kWEff = kW + (kW-1)*(dW-1);
-
-            const int bS = gradI.sizeAt(0);
-            const int iC = gradI.sizeAt(1);
-            const int iH = gradI.sizeAt(2);
-            const int iW = gradI.sizeAt(3);
-            const int oH = gradO.sizeAt(2);
-            const int oW = gradO.sizeAt(3);
-
-            nd4j_debug("MKL-DNN is not used for pooling2d_bp!\n", 0);
-
-            const Nd4jLong iStride0  = input.stridesOf()[0];
-            const Nd4jLong iStride1  = input.stridesOf()[1];
-            const Nd4jLong iStride2  = input.stridesOf()[2];
-            const Nd4jLong iStride3  = input.stridesOf()[3];
-            const Nd4jLong gIStride0 = gradI.stridesOf()[0];
-            const Nd4jLong gIStride1 = gradI.stridesOf()[1];
-            const Nd4jLong gIStride2 = gradI.stridesOf()[2];
-            const Nd4jLong gIStride3 = gradI.stridesOf()[3];
-            const Nd4jLong oStride0  = gradO.stridesOf()[0];
-            const Nd4jLong oStride1  = gradO.stridesOf()[1];
-            const Nd4jLong oStride2  = gradO.stridesOf()[2];
-            const Nd4jLong oStride3  = gradO.stridesOf()[3];
-            // buffer-offset increments for one dilated step along H / W
-            const Nd4jLong iStep2    = dH*iStride2;
-            const Nd4jLong iStep3    = dW*iStride3;
-            const Nd4jLong gIStep2   = dH*gIStride2;
-            const Nd4jLong gIStep3   = dW*gIStride3;
-            const int      kProd     = kH*kW;
-
-            // when input and gradI share strides, one offset addresses the same element in both buffers
-            const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3;
-
-            if(poolingMode == 0) {        // max
-                auto func = PRAGMA_THREADS_FOR_2D {
-                    Nd4jLong hstart, wstart, hend, wend, maxKH, maxKW;
-                    T sum, valO, *pIn;
-
-                    for (int b = start_x; b < stop_x; b += inc_x) {
-                        for (int c = start_y; c < stop_y; c += inc_y) {
-                            for (int oh = 0; oh < oH; ++oh) {
-                                for (int ow = 0; ow < oW; ++ow) {
-
-                                    pIn = in + b * iStride0 + c * iStride1;
-
-                                    hstart = oh * sH - pH;
-                                    wstart = ow * sW - pW;
-                                    hend = hstart + kHEff;
-                                    wend = wstart + kWEff;
-
-                                    // clip the window to [0, iH) x [0, iW), moving in whole dilation steps
-                                    if (hstart < 0)
-                                        hstart += dH * ((-hstart + dH - 1) / dH);   // integer ceil(-hstart / dH) steps forward
-                                    if (wstart < 0)
-                                        wstart += dW * ((-wstart + dW - 1) / dW);   // integer ceil(-wstart / dW) steps forward
-                                    if (hend > iH)
-                                        hend -= dH * ((hend - iH + dH - 1) / dH);   // integer ceil((hend - iH) / dH) steps back
-                                    if (wend > iW)
-                                        wend -= dW * ((wend - iW + dW - 1) / dW);   // integer ceil((wend - iW) / dW) steps back
-
-                                    // "sum" holds the running maximum of the window
-                                    sum = -DataTypeUtils::max<T>();
-                                    valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3];
-
-                                    if (sameStrides) {
-
-                                        // switch from indices to buffer offsets
-                                        hstart *= iStride2;
-                                        hend *= iStride2;
-                                        wstart *= iStride3;
-                                        wend *= iStride3;
-
-                                        // we set these to default values
-                                        maxKH = hstart;
-                                        maxKW = wstart;
-
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) {
-                                                T valIn = pIn[kh + kw];
-                                                if (valIn > sum) {
-                                                    sum = valIn;
-                                                    maxKH = kh;
-                                                    maxKW = kw;
-                                                }
-                                            }
-                                        // the whole output gradient goes to the position of the maximum
-                                        gI[pIn - in + maxKH + maxKW] += valO;
-                                    } else {
-
-                                        // we set these to default values
-                                        maxKH = hstart;
-                                        maxKW = wstart;
-
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += dH)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
-                                                T valIn = pIn[kh * iStride2 + kw * iStride3];
-                                                if (valIn > sum) {
-                                                    sum = valIn;
-                                                    maxKH = kh;
-                                                    maxKW = kw;
-                                                }
-                                            }
-
-                                        gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-            }
-/*************************************************************************/
-            else if(poolingMode == 1) {     // avg
-                auto func = PRAGMA_THREADS_FOR_2D {
-                    Nd4jLong hstart, wstart, hend, wend;
-                    T valO, *pgI;
-
-                    for (int b = start_x; b < stop_x; b += inc_x) {
-                        for (int c = start_y; c < stop_y; c += inc_y) {
-                            for (int oh = 0; oh < oH; ++oh) {
-                                for (int ow = 0; ow < oW; ++ow) {
-
-                                    pgI = gI + b * gIStride0 + c * gIStride1;
-
-                                    hstart = oh * sH - pH;
-                                    wstart = ow * sW - pW;
-                                    hend = hstart + kHEff;
-                                    wend = wstart + kWEff;
-
-                                    // clip the window to [0, iH) x [0, iW), moving in whole dilation steps
-                                    if (hstart < 0)
-                                        hstart += dH * ((-hstart + dH - 1) / dH);
-                                    if (wstart < 0)
-                                        wstart += dW * ((-wstart + dW - 1) / dW);
-                                    if (hend > iH)
-                                        hend -= dH * ((hend - iH + dH - 1) / dH);
-                                    if (wend > iW)
-                                        wend -= dW * ((wend - iW + dW - 1) / dW);
-
-                                    // switch from indices to gradI buffer offsets
-                                    hstart *= gIStride2;
-                                    hend *= gIStride2;
-                                    wstart *= gIStride3;
-                                    wend *= gIStride3;
-
-                                    valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3];
-
-                                    if ((int) extraParam0 == 0)         //Exclude padding
-                                        // divisor = taps along H * taps along W of the clipped window
-                                        valO /= static_cast<T>(sd::math::nd4j_ceil<double, T>(
-                                                static_cast<double>(hend - hstart) / static_cast<double>(gIStep2))) *
-                                                static_cast<T>(sd::math::nd4j_ceil<double, T>(
-                                                        static_cast<double>(wend - wstart) /
-                                                        static_cast<double>(gIStep3)));   //Accounts for dilation
-                                    else if ((int) extraParam0 == 1)    //Include padding
-                                        valO /= kProd;
-
-                                    // every in-bounds tap receives the same share
-                                    for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2)
-                                        for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3)
-                                            pgI[kh + kw] += valO;
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-            }
-/*************************************************************************/
-            else if(poolingMode == 2) {  // pnorm
-                auto func = PRAGMA_THREADS_FOR_2D {
-                    Nd4jLong hstart, wstart, hend, wend;
-                    T sum, valO, *pIn, *pgI;
-
-                    for (int b = start_x; b < stop_x; b += inc_x) {
-                        for (int c = start_y; c < stop_y; c += inc_y) {
-                            for (int oh = 0; oh < oH; ++oh) {
-                                for (int ow = 0; ow < oW; ++ow) {
-
-                                    pIn = in + b * iStride0 + c * iStride1;
-                                    pgI = sameStrides ? gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1;
-
-                                    hstart = oh * sH - pH;
-                                    wstart = ow * sW - pW;
-                                    hend = hstart + kHEff;
-                                    wend = wstart + kWEff;
-
-                                    // clip the window to [0, iH) x [0, iW), moving in whole dilation steps
-                                    if (hstart < 0)
-                                        hstart += dH * ((-hstart + dH - 1) / dH);
-                                    if (wstart < 0)
-                                        wstart += dW * ((-wstart + dW - 1) / dW);
-                                    if (hend > iH)
-                                        hend -= dH * ((hend - iH + dH - 1) / dH);
-                                    if (wend > iW)
-                                        wend -= dW * ((wend - iW + dW - 1) / dW);
-
-                                    // sum accumulates |x|^p over the window
-                                    sum = static_cast<T>(0.f);
-                                    valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3];
-
-                                    if (sameStrides) {
-
-                                        // switch from indices to buffer offsets (valid for both input and gradI)
-                                        hstart *= iStride2;
-                                        hend *= iStride2;
-                                        wstart *= iStride3;
-                                        wend *= iStride3;
-
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
-                                                sum += sd::math::nd4j_pow<T, T, T>(
-                                                        sd::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);
-
-                                        // An all-zero window gives sum == 0; for p > 1 the factor below would be
-                                        // pow(0, negative) = inf and inf * 0 would store NaN into gradI.
-                                        // Such a window contributes a zero gradient, so skip it.
-                                        if (sum == static_cast<T>(0.f))
-                                            continue;
-
-                                        // valO * sum^((1 - p) / p)
-                                        valO *= sd::math::nd4j_pow<T, T, T>(sum,
-                                                                              ((T) 1. - extraParam0) / extraParam0);
-
-                                        // per-tap gradient: valO * |x|^(p - 1) * sgn(x)
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
-                                                pgI[kh + kw] += valO * sd::math::nd4j_pow<T, T, T>(
-                                                        sd::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0 - 1.f) *
-                                                                sd::math::nd4j_sgn<T, T>(pIn[kh + kw]);
-                                    } else {
-
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += dH)
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += dW)
-                                                sum += sd::math::nd4j_pow<T, T, T>(
-                                                        sd::math::nd4j_abs<T>(pIn[kh * iStride2 + kw * iStride3]),
-                                                        extraParam0);
-
-                                        // same all-zero-window guard as in the sameStrides branch
-                                        if (sum == static_cast<T>(0.f))
-                                            continue;
-
-                                        valO *= sd::math::nd4j_pow<T, T, T>(sum,
-                                                                              ((T) 1. - extraParam0) / extraParam0);
-
-                                        for (Nd4jLong kh = hstart; kh < hend; kh += dH) {
-                                            for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
-                                                const auto inVal = pIn[kh * iStride2 + kw * iStride3];
-                                                pgI[kh * gIStride2 + kw * gIStride3] += valO *
-                                                                                        sd::math::nd4j_pow<T, T, T>(
-                                                                                                sd::math::nd4j_abs<T>(
-                                                                                                        inVal),
-                                                                                                extraParam0 - 1.f) *
-                                                                                        sd::math::nd4j_sgn<T, T>(
-                                                                                                inVal);
-                                            }
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-            }
-            else {
-                nd4j_printf("ConvolutionUtils::pooling2dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode);
-                throw std::runtime_error("Incorrect pooling2dBP mode");
-            }
-        }
-
-//////////////////////////////////////////////////////////////////////////
-        template <typename T>
-        static void pooling3dBP_(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-            // input [bS, iC, iD, iH, iW]
-            // gradI [bS, iC, iD, iH, iW] -> gradI is output in this function
-            // gradO [bS, iC, oD, oH, oW]
-
-            // initial zeroing of gradI
-            gradI.nullify();
-
-            T* in = const_cast<NDArray&>(input).bufferAsT<T>();
-            T* gO = const_cast<NDArray&>(gradO).bufferAsT<T>();
-            T* gI = gradI.bufferAsT<T>();
-
-            const int kDEff = kD + (kD-1)*(dD-1);
-            const int kHEff = kH + (kH-1)*(dH-1);
-            const int kWEff = kW + (kW-1)*(dW-1);
-
-            const int bS = gradI.sizeAt(0);
-            const int iC = gradI.sizeAt(1);
-            const int iD = gradI.sizeAt(2);
-            const int iH = gradI.sizeAt(3);
-            const int iW = gradI.sizeAt(4);
-            const int oC = gradO.sizeAt(1);
-            const int oD = gradO.sizeAt(2);
-            const int oH = gradO.sizeAt(3);
-            const int oW = gradO.sizeAt(4);
-
-            nd4j_debug("MKL-DNN is not used for pooling3d_bp!\n", 0);
-
-            const Nd4jLong iStride0  = input.stridesOf()[0];
-            const Nd4jLong iStride1  = input.stridesOf()[1];
-            const Nd4jLong iStride2  = input.stridesOf()[2];
-            const Nd4jLong iStride3  = input.stridesOf()[3];
-            const Nd4jLong iStride4  = input.stridesOf()[4];
-            const Nd4jLong gIStride0 = gradI.stridesOf()[0];
-            const Nd4jLong gIStride1 = gradI.stridesOf()[1];
-            const Nd4jLong gIStride2 = gradI.stridesOf()[2];
-            const Nd4jLong gIStride3 = gradI.stridesOf()[3];
-            const Nd4jLong gIStride4 = gradI.stridesOf()[4];
-            const Nd4jLong oStride0 = gradO.stridesOf()[0];
-            const Nd4jLong oStride1 = gradO.stridesOf()[1];
-            const Nd4jLong oStride2 = gradO.stridesOf()[2];
-            const Nd4jLong oStride3 = gradO.stridesOf()[3];
-            const Nd4jLong oStride4 = gradO.stridesOf()[4];
-            const Nd4jLong iStep2   = dD*iStride2;
-            const Nd4jLong iStep3   = dH*iStride3;
-            const Nd4jLong iStep4   = dW*iStride4;
-            const Nd4jLong gIStep2  = dD*gIStride2;
-            const Nd4jLong gIStep3  = dH*gIStride3;
-            const Nd4jLong gIStep4  = dW*gIStride4;
-            const int      kProd    = kD*kH*kW;
-
-            const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3 && iStride4 == gIStride4;
-
-            if(poolingMode == 0) {        // max
-                auto func = PRAGMA_THREADS_FOR_2D {
-                    Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW;
-                    T sum, valO, *pIn, *pgI;
-
-                    for (int b = start_x; b < stop_x; b++) {
-                        for (int c = start_y; c < stop_y; c++) {
-                            for (int od = 0; od < oD; od++) {
-                                for (int oh = 0; oh < oH; ++oh) {
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        pIn = in + b * iStride0 + c * iStride1;
-
-                                        dstart = od * sD - pD;
-                                        hstart = oh * sH - pH;
-                                        wstart = ow * sW - pW;
-                                        dend = dstart + kDEff;
-                                        hend = hstart + kHEff;
-                                        wend = wstart + kWEff;
-
-                                        if (dstart < 0)
-                                            dstart += dD * ((-dstart + dD - 1) / dD);
-                                        if (hstart < 0)
-                                            hstart += dH * ((-hstart + dH - 1) / dH);
-                                        if (wstart < 0)
-                                            wstart += dW * ((-wstart + dW - 1) / dW);
-                                        if (dend > iD)
-                                            dend -= dD * ((dend - iD + dD - 1) / dD);
-                                        if (hend > iH)
-                                            hend -= dH * ((hend - iH + dH - 1) / dH);
-                                        if (wend > iW)
-                                            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-                                        sum = -DataTypeUtils::max<T>();
-                                        valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4];
-
-                                        if (sameStrides) {
-
-                                            dstart *= iStride2;
-                                            dend *= iStride2;
-                                            hstart *= iStride3;
-                                            hend *= iStride3;
-                                            wstart *= iStride4;
-                                            wend *= iStride4;
-
-                                            maxKD = dstart;
-                                            maxKH = hstart;
-                                            maxKW = wstart;
-
-                                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
-                                                        T valIn = pIn[kd + kh + kw];
-                                                        if (valIn > sum) {
-                                                            sum = valIn;
-                                                            maxKD = kd;
-                                                            maxKH = kh;
-                                                            maxKW = kw;
-                                                        }
-                                                    }
-                                            gI[pIn - in + maxKD + maxKH + maxKW] += valO;
-                                        } else {
-
-                                            // we set these to default values
-                                            maxKH = hstart;
-                                            maxKW = wstart;
-                                            maxKD = dstart;
-
-                                            for (Nd4jLong kd = dstart; kd < dend; kd += dD)
-                                                for (Nd4jLong kh = hstart; kh < hend; kh += dH)
-                                                    for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
-                                                        T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4];
-                                                        if (valIn > sum) {
-                                                            sum = valIn;
-                                                            maxKD = kd;
-                                                            maxKH = kh;
-                                                            maxKW = kw;
-                                                        }
-                                                    }
-
-                                            gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO;
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-            }
-/*************************************************************************/
-            else if(poolingMode == 1) {     // avg
-                auto func = PRAGMA_THREADS_FOR_2D {
-                    Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW;
-                    T sum, valO, *pIn, *pgI;
-
-                    for (int b = start_x; b < stop_x; b++) {
-                        for (int c = start_y; c < stop_y; c++) {
-                            for (int od = 0; od < oD; od++) {
-                                for (int oh = 0; oh < oH; ++oh) {
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        pgI = gI + b * gIStride0 + c * gIStride1;
-
-                                        dstart = od * sD - pD;
-                                        hstart = oh * sH - pH;
-                                        wstart = ow * sW - pW;
-                                        dend = dstart + kDEff;
-                                        hend = hstart + kHEff;
-                                        wend = wstart + kWEff;
-
-                                        if (dstart < 0)
-                                            dstart += dD * ((-dstart + dD - 1) / dD);
-                                        if (hstart < 0)
-                                            hstart += dH * ((-hstart + dH - 1) / dH);
-                                        if (wstart < 0)
-                                            wstart += dW * ((-wstart + dW - 1) / dW);
-                                        if (dend > iD)
-                                            dend -= dD * ((dend - iD + dD - 1) / dD);
-                                        if (hend > iH)
-                                            hend -= dH * ((hend - iH + dH - 1) / dH);
-                                        if (wend > iW)
-                                            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-                                        dstart *= gIStride2;
-                                        dend *= gIStride2;
-                                        hstart *= gIStride3;
-                                        hend *= gIStride3;
-                                        wstart *= gIStride4;
-                                        wend *= gIStride4;
-
-                                        valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4];
-
-                                        if (extraParam0 == 0)         //Exclude padding
-                                            valO /= sd::math::nd4j_ceil<double, T>(static_cast<double>(dend - dstart) / static_cast<double>(gIStep2)) * sd::math::nd4j_ceil<double, T>(static_cast<double>(hend - hstart) / static_cast<double>(gIStep3)) * sd::math::nd4j_ceil<double, T>(static_cast<double>(wend - wstart) / static_cast<double>(gIStep4));   //Accounts for dilation
-                                        else if (extraParam0 == 1)    //Include padding
-                                            valO /= kProd;
-
-                                        for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2)
-                                            for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3)
-                                                for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4)
-                                                    pgI[kd + kh + kw] += valO;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-            }
-/*************************************************************************/
-            else if(poolingMode == 2) {  // pnorm
-                auto func = PRAGMA_THREADS_FOR_2D {
-                    Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW;
-                    T sum, valO, *pIn, *pgI;
-
-                    for (int b = start_x; b < stop_x; b++) {
-                        for (int c = start_y; c < stop_y; c++) {
-                            for (int od = 0; od < oD; od++) {
-                                for (int oh = 0; oh < oH; ++oh) {
-                                    for (int ow = 0; ow < oW; ++ow) {
-
-                                        pIn = in + b * iStride0 + c * iStride1;
-                                        pgI = gI + (pIn - in);
-
-                                        dstart = od * sD - pD;
-                                        hstart = oh * sH - pH;
-                                        wstart = ow * sW - pW;
-                                        dend = dstart + kDEff;
-                                        hend = hstart + kHEff;
-                                        wend = wstart + kWEff;
-
-                                        if (dstart < 0)
-                                            dstart += dD * ((-dstart + dD - 1) / dD);
-                                        if (hstart < 0)
-                                            hstart += dH * ((-hstart + dH - 1) / dH);
-                                        if (wstart < 0)
-                                            wstart += dW * ((-wstart + dW - 1) / dW);
-                                        if (dend > iD)
-                                            dend -= dD * ((dend - iD + dD - 1) / dD);
-                                        if (hend > iH)
-                                            hend -= dH * ((hend - iH + dH - 1) / dH);
-                                        if (wend > iW)
-                                            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-                                        sum = static_cast<T>(0.);
-                                        valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4];
-
-                                        if (sameStrides) {
-
-                                            dstart *= iStride2;
-                                            dend *= iStride2;
-                                            hstart *= iStride3;
-                                            hend *= iStride3;
-                                            wstart *= iStride4;
-                                            wend *= iStride4;
-
-                                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                                        sum += sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0);
-
-                                            valO *= sd::math::nd4j_pow<T, T, T>(sum, ((T) 1.f - extraParam0) / extraParam0);
-
-                                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
-                                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
-                                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
-                                                        pgI[kd + kh + kw] += valO * sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kd + kh + kw]),extraParam0 - (T) 1.f) * sd::math::nd4j_sgn<T, T>(pIn[kd + kh + kw]);
-                                        } else {
-                                            for (Nd4jLong kd = dstart; kd < dend; kd += dD)
-                                                for (Nd4jLong kh = hstart; kh < hend; kh += dH)
-                                                    for (Nd4jLong kw = wstart; kw < wend; kw += dW)
-                                                        sum += sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0);
-
-                                            valO *= sd::math::nd4j_pow<T, T, T>(sum, ((T) 1.f - extraParam0) / extraParam0);
-
-                                            for (Nd4jLong kd = dstart; kd < dend; kd += dD)
-                                                for (Nd4jLong kh = hstart; kh < hend; kh += dH)
-                                                    for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
-                                                        const auto inVal = pIn[kD * iStride2 + kh * iStride3 + kw * iStride4];
-                                                        pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(inVal), extraParam0 - 1.f) * sd::math::nd4j_sgn<T, T>(inVal);
-                                                    }
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                };
-
-                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
-            }
-            else {
-                nd4j_printf("ConvolutionUtils::pooling3dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode);
-                throw "";
-            }
-        }
-
-
-
-
-        void ConvolutionUtils::conv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::conv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::depthwiseConv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::depthwiseConv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::sconv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), sconv2d_, (block, input, weightsDepth, weightsPoint, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::vol2col(sd::graph::Context& block, const NDArray& volume, NDArray& columns, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-            BUILD_SINGLE_SELECTOR(volume.dataType(), vol2col_, (volume, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::col2vol(sd::graph::Context& block, const NDArray& columns, NDArray& volume, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-            BUILD_SINGLE_SELECTOR(volume.dataType(), col2vol_, (columns, volume, sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::upsampling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) {
-            BUILD_SINGLE_SELECTOR(input.dataType(), upsampling2d_, (input, output, factorH, factorW, isNCHW), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::upsampling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
-            BUILD_SINGLE_SELECTOR(input.dataType(), upsampling3d_, (input, output, factorD, factorH, factorW, isNCDHW), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::upsampling2dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) {
-            BUILD_SINGLE_SELECTOR(gradO.dataType(), upsampling2dBP_, (gradO, gradI, isNCHW), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::upsampling3dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) {
-            BUILD_SINGLE_SELECTOR(gradO.dataType(), upsampling3dBP_, (gradO, gradI, isNCHW), FLOAT_TYPES);
-        }
-
-
-
-        void ConvolutionUtils::pooling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) {
-            BUILD_SINGLE_SELECTOR(input.dataType(), pooling2d_, (block, input, output, kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::pooling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-            BUILD_SINGLE_SELECTOR(input.dataType(), pooling3d_, (block, input, output, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::pooling2dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-            BUILD_SINGLE_SELECTOR(input.dataType(), pooling2dBP_, (block, input, gradO, gradI, kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);
-        }
-        void ConvolutionUtils::pooling3dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-            BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dBP_, (block, input, gradO, gradI, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);
-        }
-    }
-}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_col2vol.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_col2vol.cpp
new file mode 100644
index 000000000..c9cae504a
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_col2vol.cpp
@@ -0,0 +1,143 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+// [bS, iC, kD, kH, kW, oD, oH, oW] is de-convoluted to [bS, iC, iD, iH, iW]
+template <typename T>
+static void col2vol_(const NDArray& columns, NDArray& volume, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+            // Accumulates (+=) every element of columns into the volume cell it maps to; sX multiplies the output index, dX multiplies the kernel index, pX is subtracted (X = D/H/W).
+            // initial zeroing of volume content
+            volume.nullify();
+
+            const int bS = volume.sizeAt(0);
+            const int iC = volume.sizeAt(1);
+            const int iD = volume.sizeAt(2);
+            const int iH = volume.sizeAt(3);
+            const int iW = volume.sizeAt(4);
+            const int kD = columns.sizeAt(2);
+            const int kH = columns.sizeAt(3);
+            const int kW = columns.sizeAt(4);
+            const int oD = columns.sizeAt(5);
+            const int oH = columns.sizeAt(6);
+            const int oW = columns.sizeAt(7);
+            const Nd4jLong colStride0 = columns.stridesOf()[0];
+            const Nd4jLong colStride1 = columns.stridesOf()[1];
+            const Nd4jLong colStride2 = columns.stridesOf()[2];
+            const Nd4jLong colStride3 = columns.stridesOf()[3];
+            const Nd4jLong colStride4 = columns.stridesOf()[4];
+            const Nd4jLong colStride5 = columns.stridesOf()[5];
+            const Nd4jLong colStride6 = columns.stridesOf()[6];
+            const Nd4jLong colStride7 = columns.stridesOf()[7];
+            const Nd4jLong volStride0 = volume.stridesOf()[0];
+            const Nd4jLong volStride1 = volume.stridesOf()[1];
+            const Nd4jLong volStride2 = volume.stridesOf()[2];
+            const Nd4jLong volStride3 = volume.stridesOf()[3];
+            const Nd4jLong volStride4 = volume.stridesOf()[4];
+            // element pointers; columns is only read below (*vol += *col) - the const_cast is presumably needed because bufferAsT is non-const (TODO confirm)
+            T* volBuff = volume.bufferAsT<T>();
+            T* colBuff = const_cast<NDArray&>(columns).bufferAsT<T>();
+
+            // Both branches perform the same accumulation and differ only in loop nesting: kernel offsets outermost (first) vs output positions outermost (second).
+            if (volume.ordering() == 'c' &&  columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) {
+
+                auto func = PRAGMA_THREADS_FOR {
+                    T* col, *vol;
+                    int volDep, volRow, volCol;
+
+                    for (int b = start; b < stop; b++) {
+                        for (int c = 0; c < iC; c++) {
+                            for (int kDep = 0; kDep < kD; ++kDep) {
+                                for (int kRow = 0; kRow < kH; ++kRow) {
+                                    for (int kCol = 0; kCol < kW; ++kCol) {
+                                        for (int colD = 0; colD < oD; ++colD) {
+                                            for (int colH = 0; colH < oH; ++colH) {
+                                                for (int colW = 0; colW < oW; ++colW) {
+                                                    // volume coordinates addressed by this (kernel offset, output position) pair
+                                                    volDep = -pD + kDep * dD + colD * sD;
+                                                    volRow = -pH + kRow * dH + colH * sH;
+                                                    volCol = -pW + kCol * dW + colW * sW;
+                                                    // a negative int cast to unsigned becomes a huge value, so one compare per axis rejects both < 0 and >= size
+                                                    if (static_cast<unsigned>(volDep) < static_cast<unsigned>(iD) && static_cast<unsigned>(volRow) < static_cast<unsigned>(iH) && static_cast<unsigned>(volCol) < static_cast<unsigned>(iW)) {
+                                                        col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7;
+                                                        vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4;
+                                                        *vol += *col;
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_tad(func, 0, bS);   // func handles batches [start, stop); presumably parallel_tad splits [0, bS) across threads (TODO confirm)
+
+            } else {
+
+                auto func = PRAGMA_THREADS_FOR {
+                    T* col, *vol;
+                    int volDep, volRow, volCol;
+
+                    for (int b = start; b < stop; b++) {
+                        for (int colD = 0; colD < oD; colD++) {
+                            for (int colH = 0; colH < oH; ++colH) {
+                                for (int colW = 0; colW < oW; ++colW) {
+                                    for (int c = 0; c < iC; ++c) {
+                                        for (int kDep = 0; kDep < kD; ++kDep) {
+                                            for (int kRow = 0; kRow < kH; ++kRow) {
+                                                for (int kCol = 0; kCol < kW; ++kCol) {
+                                                    // same coordinate mapping as in the first branch
+                                                    volDep = (-pD + kDep * dD) + colD * sD;
+                                                    volRow = (-pH + kRow * dH) + colH * sH;
+                                                    volCol = (-pW + kCol * dW) + colW * sW;
+                                                    // same unsigned range check as in the first branch
+                                                    if (static_cast<unsigned>(volDep) < static_cast<unsigned>(iD) && static_cast<unsigned>(volRow) < static_cast<unsigned>(iH) && static_cast<unsigned>(volCol) < static_cast<unsigned>(iW)) {
+                                                        col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7;
+                                                        vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4;
+                                                        *vol += *col;
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_tad(func, 0, bS);   // same batch-range dispatch as in the first branch
+            }
+        }
+
+void ConvolutionUtils::col2vol(sd::graph::Context& block, const NDArray& columns, NDArray& volume, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+            BUILD_SINGLE_SELECTOR(volume.dataType(), col2vol_, (columns, volume, sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);   // keyed on volume's data type; presumably picks the matching col2vol_<T> among FLOAT_TYPES (macro defined elsewhere - TODO confirm)
+}   // note: `block` is accepted but not forwarded to col2vol_
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_conv2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_conv2d.cpp
new file mode 100644
index 000000000..45e66651c
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_conv2d.cpp
@@ -0,0 +1,107 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include<ops/declarable/helpers/addBias.h>
+#include <ops/declarable/helpers/im2col.h>
+#include <ops/declarable/helpers/col2im.h>
+#include <array/NDArrayFactory.h>
+#include <helpers/MmulHelper.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void conv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+            // Forward 2D convolution: im2col unfolds the input into columns, tensorDot contracts them with the weights, then the optional bias is added.
+            // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+            // weights [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
+            // bias    [oC]
+            // output  [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+
+            // kH  filter(kernel) height
+            // kW  filter(kernel) width
+            // sH  strides height
+            // sW  strides width
+            // pH  paddings height
+            // pW  paddings width
+            // dH  dilations height
+            // dW  dilations width
+            // paddingMode 0-VALID, 1-SAME
+            // isNCHW      1-NCHW,  0-NHWC
+
+            int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+            int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+            ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+            nd4j_debug("MKL-DNN is not used for conv2d!\n", 0);
+
+            std::vector<int> permutForOutput;
+            NDArray inputNCHW;                                                              // holds the permuted NHWC input; a stack object, so it is released on every exit path (including exceptions)
+            if(isNCHW)
+                permutForOutput = {0, 3, 1, 2};                                             // [bS, oH, oW, oC] -> [bS, oC, oH, oW]
+            else {
+                inputNCHW = input->permute({0, 3, 1, 2});                                   // [bS, iH, iW, iC] -> [bS, iC, iH, iW] if NHWC
+                input = &inputNCHW;                                                         // from here on `input` is always in NCHW layout
+            }
+
+            std::vector<int> wAxes;
+            if(0 == wFormat)
+                wAxes = {0, 1, 2};                                                          // kH, kW, iC axes of [kH, kW, iC, oC]
+            else if(1 == wFormat)
+                wAxes = {2, 3, 1};                                                          // kH, kW, iC axes of [oC, iC, kH, kW]
+            else
+                wAxes = {1, 2, 3};                                                          // kH, kW, iC axes of [oC, kH, kW, iC]
+
+            NDArray col('c', {bS, oH, oW, kH, kW, iC}, input->dataType(), input->getContext());
+            NDArray colP = col.permute({0, 5, 3, 4, 1, 2});            // {bS, iC, kH, kW, oH, oW}
+            NDArray mmulResult('f', {bS*oH*oW, oC}, output->dataType(), output->getContext());
+
+            //----- calculation of output -----//
+            auto ctx = block.launchContext();
+            helpers::im2col(*ctx, *input, colP, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
+            MmulHelper::tensorDot(&col, weights, &mmulResult, {3,4,5}, wAxes, {}); // [bS, oH, oW, kH, kW, iC] x [kH, kW, iC, oC] = [bS, oH, oW, oC]
+
+            //----- assign outTemp to output  -----//
+            if(isNCHW) {
+                mmulResult.reshapei({bS, oH, oW, oC});
+                mmulResult.permutei(permutForOutput);
+            }
+            output->assign(mmulResult);
+
+            //----- add biases if required -----//
+            if(bias)
+                helpers::addBias(block, *output, *bias, *output, isNCHW);
+
+            // no manual cleanup: inputNCHW (when used) is destroyed automatically at scope exit
+
+        }
+
+void ConvolutionUtils::conv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);   // keyed on input's data type; presumably instantiates conv2d_<X, Y> with that one type for both parameters (macro defined elsewhere - TODO confirm)
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_conv2dBP.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_conv2dBP.cpp
new file mode 100644
index 000000000..6a01a4a4d
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_conv2dBP.cpp
@@ -0,0 +1,127 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <ops/declarable/helpers/im2col.h>
+#include <ops/declarable/helpers/col2im.h>
+#include <array/NDArrayFactory.h>
+#include <helpers/MmulHelper.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void conv2dBP_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+            // Backward pass of 2D convolution: fills gradW (if non-null) via im2col + tensorDot, gradB (if non-null) by summing gradO, and gradI via tensorDot + col2im. bias, X and Y are not referenced in the body.
+            // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+            // weights [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
+            // bias    [oC]
+            // gradO   [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
+
+            // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+            // gradW    [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
+            // gradB    [oC]
+
+            // kH         filter(kernel) height
+            // kW         filter(kernel) width
+            // sH         strides height
+            // sW         strides width
+            // pH         paddings height
+            // pW         paddings width
+            // dH         dilations height
+            // dW         dilations width
+            // paddingMode 0-VALID, 1-SAME
+            // isNCHW      0-NHWC, 1-NCHW
+
+            int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+            int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+            ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+            nd4j_debug("MKL-DNN is not used for conv2d_bp!\n", 0);
+
+            std::vector<int> gradOaxesForDot;
+
+            if(!isNCHW) {                                                               // NHWC: the local input/gradI pointers are swapped for heap-allocated permuted NDArrays, freed at the end of the function
+                gradOaxesForDot  = {0, 1, 2};                                           // bS, oH, oW
+                input = new NDArray(input->permute({0, 3, 1, 2}));                      // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
+                gradI = new NDArray(gradI->permute({0, 3, 1, 2}));                      // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
+            } else {
+                gradOaxesForDot  = {0, 2, 3};                                           // bS, oH, oW
+            }
+
+            std::vector<int> wPermut, colPermut;
+
+            if(0 == wFormat) {
+                wPermut   = {2, 0, 1, 3};
+                colPermut = {2, 3, 1, 0, 4, 5};
+            }
+            else if(1 == wFormat) {
+                wPermut   = {1, 2, 3, 0};
+                colPermut = {1, 2, 3, 0, 4, 5};
+            }
+            else {
+                wPermut   = {3, 1, 2, 0};
+                colPermut = {2, 3, 1, 0, 4, 5};
+            }
+
+            NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
+            // columns is used twice: first as the im2col result feeding gradW, then overwritten by the weights x gradO product that col2im folds into gradI
+            // ----- calculation of gradW ----- //
+            if(gradW) {
+                auto ctx = block.launchContext();
+                helpers::im2col(*ctx, *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));   // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
+                sd::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, gradOaxesForDot, wPermut);       // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC]
+            }
+
+            // ----- calculation of gradB ----- //
+            if(gradB) {
+                NDArray* gradBR = gradB;
+                if(gradB->rankOf() == 2)                                                                     // rank-2 gradB is reduced into a rank-1 reshape of itself; presumably that reshape shares gradB's buffer, since gradBR is deleted right after the reduction - confirm
+                    gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
+                gradO->reduceAlongDimension(reduce::Sum, *gradBR, gradOaxesForDot);                          // sum over bS, oH, oW
+                if(gradBR != gradB)
+                    delete gradBR;
+            }
+
+            //----- calculation of gradI -----//
+            // [kH, kW, iC, oC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]
+            // [oC, iC, kH, kW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, bS, oH, oW]
+            // [oC, kH, kW, iC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]
+            sd::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, colPermut);
+
+            helpers::col2im(*block.launchContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW);       // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
+
+            if(!isNCHW) {                                                               // release the permuted NDArrays allocated for the NHWC case
+                delete input;
+                delete gradI;
+            }
+        }
+
+void ConvolutionUtils::conv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+     BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // public entry point: forwards every argument unchanged to conv2dBP_; the macro (defined elsewhere) presumably picks the template instantiation from input->dataType() among FLOAT_TYPES - confirm
+}
+
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_depthwiseConv2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_depthwiseConv2d.cpp
new file mode 100644
index 000000000..fa86dbd6c
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_depthwiseConv2d.cpp
@@ -0,0 +1,101 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include<ops/declarable/helpers/addBias.h>
+#include <ops/declarable/helpers/im2col.h>
+#include <ops/declarable/helpers/col2im.h>
+#include <array/NDArrayFactory.h>
+#include <helpers/MmulHelper.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void depthwiseConv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+            // Forward pass of depthwise 2D convolution: im2col on the input, one tensorDot against the weights written into a reshape of output, then bias is added when non-null. X and Y are not referenced in the body.
+            // input     [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+            // weights   [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+            // bias      [oC] = iC*mC
+            // output    [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)
+
+            // kH           filter(kernel) height
+            // kW           filter(kernel) width
+            // sH           strides height
+            // sW           strides width
+            // pH           paddings height
+            // pW           paddings width
+            // dH           dilations height
+            // dW           dilations width
+            // paddingMode  0-VALID, 1-SAME
+            // isNCHW       0-NHWC,  1-NCHW
+
+            int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
+            int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
+            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+            mC = weights->sizeAt(indWmC);                           // channels multiplier
+
+            std::vector<std::vector<Nd4jLong>> modifColumns = {{1,0,4,5,2,3}, {iC,bS*oH*oW,kH*kW}};  // [bS,iC,kH,kW,oH,oW] -> [iC,bS,oH,oW,kH,kW] -> [iC,bS*oH*oW,kH*kW]
+            std::vector<std::vector<Nd4jLong>> modifOutput, modifWeights;
+            std::vector<Nd4jLong> outReShape;
+
+            if(!isNCHW) {                                                                       // NHWC: the local input pointer is swapped for a heap-allocated permuted NDArray, freed at the end of the function
+                outReShape = {bS, oH, oW, iC, mC};                                              // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
+                modifOutput = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
+                input = new NDArray(input->permute({0, 3, 1, 2}));                              // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
+            }
+            else {
+                outReShape = {bS, iC, mC, oH, oW};                                              // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
+                modifOutput = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
+            }
+
+            if(0 == wFormat)
+                modifWeights = {{2,0,1,3},{iC,kH*kW,mC}};
+            else if(1 == wFormat)
+                modifWeights = {{1,2,3,0},{iC,kH*kW,mC}};
+            else
+                modifWeights = {{3,1,2,0},{iC,kH*kW,mC}};
+
+            if(paddingMode == 1)                       // SAME
+                ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);
+
+            NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
+            NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false);    // tensorDot writes here while addBias below reads output, so this presumably shares output's buffer - confirm
+
+            helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
+            MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, modifWeights, modifOutput);              // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC]
+
+            if(bias)
+                // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
+                helpers::addBias(block, *output, *bias, *output, isNCHW);
+
+            if(!isNCHW)                                // release the permuted NDArray allocated for the NHWC case
+                delete input;
+        }
+
+void ConvolutionUtils::depthwiseConv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // public entry point: forwards every argument unchanged to depthwiseConv2d_; the macro (defined elsewhere) presumably picks the template instantiation from input->dataType() among FLOAT_TYPES - confirm
+        }
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_depthwiseConv2dBP.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_depthwiseConv2dBP.cpp
new file mode 100644
index 000000000..7c0d933e2
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_depthwiseConv2dBP.cpp
@@ -0,0 +1,120 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <ops/declarable/helpers/im2col.h>
+#include <ops/declarable/helpers/col2im.h>
+#include <helpers/MmulHelper.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+            // Backward pass of depthwise 2D convolution: computes gradW via im2col + tensorDot, gradB (if non-null) by summing gradO, and gradI via tensorDot + col2im. bias, X and Y are not referenced in the body.
+            // input    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+            // weights  [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+            // bias     [oC] = [iC*mC]
+            // gradO    [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
+            // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+            // gradW    [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+            // gradB    [oC]
+
+            //  kH          filter(kernel) height
+            //  kW          filter(kernel) width
+            //  sH          strides height
+            //  sW          strides width
+            //  pH          paddings height
+            //  pW          paddings width
+            //  dH          dilations height
+            //  dW          dilations width
+            //  paddingMode 0-VALID, 1-SAME
+            //  isNCHW      0-NHWC, 1-NCHW
+
+            int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
+            int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
+            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+            mC = weights->sizeAt(indWmC);                           // channels multiplier
+
+            std::vector<std::vector<Nd4jLong>> modifColumns = {{1,2,3,0,4,5}, {iC, kH*kW, bS*oH*oW}};      // [bS,iC,kH,kW,oH,oW] -> [iC, kH*kW, bS*oH*oW]
+            std::vector<std::vector<Nd4jLong>> modifGradO1, modifGradO2, modifWeights;
+            std::vector<Nd4jLong> gradOreShape;
+
+            if(!isNCHW) {                                                                       // NHWC: the local input/gradI pointers are swapped for heap-allocated permuted NDArrays, freed at the end of the function
+                gradOreShape = {bS, oH, oW, iC, mC};                                            // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
+                modifGradO1 = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
+                modifGradO2 = {{3,0,1,2},{iC, mC, bS*oH*oW}};                                   // [bS,oH,oW,iC*mC] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
+                input = new NDArray(input->permute({0, 3, 1, 2}));                              // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
+                gradI = new NDArray(gradI->permute({0, 3, 1, 2}));                              // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
+            }
+            else {
+                gradOreShape = {bS, iC, mC, oH, oW};                                            // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
+                modifGradO1 = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
+                modifGradO2 = {{1,0,2,3},{iC, mC, bS*oH*oW}};                                   // [bS,iC*mC,oH,oW] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
+            }
+
+            if(0 == wFormat)
+                modifWeights = {{2,0,1,3},{iC,kH*kW,mC}};
+            else if(1 == wFormat)
+                modifWeights = {{1,2,3,0},{iC,kH*kW,mC}};
+            else
+                modifWeights = {{3,1,2,0},{iC,kH*kW,mC}};
+
+            if(paddingMode == 1)                       // SAME
+                ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);
+
+            NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
+            NDArray gradOreshaped = gradO->reshape(gradO->ordering(), gradOreShape);
+
+            // ----- calculation of gradW ----- //
+            // NOTE(review): gradW is written without a null check, unlike gradB below - callers presumably always supply it; confirm
+            helpers::im2col(*input->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
+            sd::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, modifWeights);  // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC]
+
+            // ----- calculation of gradB ----- //
+            if(gradB) {
+                NDArray* gradBR = gradB;
+                if(gradB->rankOf() == 2)                                                                     // rank-2 gradB is reduced into a rank-1 reshape of itself; presumably that reshape shares gradB's buffer, since gradBR is deleted right after the reduction - confirm
+                    gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}, false));
+                gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1});                      // sum over bS, oH, oW
+
+                if(gradBR != gradB)
+                    delete gradBR;
+            }
+
+            //----- calculation of gradI -----//
+            sd::MmulHelper::tensorDot(weights, gradO, &columns, modifWeights, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW]
+            helpers::col2im(*input->getContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW);       // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
+
+            if(!isNCHW) {                              // release the permuted NDArrays allocated for the NHWC case
+                delete input;
+                delete gradI;
+            }
+        }
+
+void ConvolutionUtils::depthwiseConv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // public entry point: forwards everything except block (which depthwiseConv2dBP_ does not take); the macro (defined elsewhere) presumably picks the template instantiation from input->dataType() among FLOAT_TYPES - confirm
+        }
+
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling2d.cpp
new file mode 100644
index 000000000..26dc4f99e
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling2d.cpp
@@ -0,0 +1,223 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+        template <typename T>
+        static void pooling2d_(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+            // 2D pooling of a [bS, iC, iH, iW] input into a [bS, iC, oH, oW] output; poolingMode selects 0 = max, 1 = avg, 2 = pnorm, any other value prints an error and throws. block is not referenced in the body.
+            // extraParam0: for avg, 0 divides by the number of in-bounds taps and 1 divides by kH*kW (any other value leaves the plain sum); for pnorm it is the exponent p
+            T* out = output.bufferAsT<T>();
+            T* in  = const_cast<NDArray&>(input).bufferAsT<T>();
+
+            const int kHEff = kH + (kH-1)*(dH-1);                      // kernel extent along H once the dilation gaps are included
+            const int kWEff = kW + (kW-1)*(dW-1);                      // kernel extent along W once the dilation gaps are included
+
+            const int bS = input.sizeAt(0);
+            const int iC = input.sizeAt(1);
+            const int iH = input.sizeAt(2);
+            const int iW = input.sizeAt(3);
+            const int oC = output.sizeAt(1);                           // not referenced below
+            const int oH = output.sizeAt(2);
+            const int oW = output.sizeAt(3);
+
+            nd4j_debug("MKL-DNN is not used for pooling2d!\n", 0);
+
+            const Nd4jLong iStride0 = input.stridesOf()[0];
+            const Nd4jLong iStride1 = input.stridesOf()[1];
+            const Nd4jLong iStride2 = input.stridesOf()[2];
+            const Nd4jLong iStride3 = input.stridesOf()[3];
+            const Nd4jLong oStride0 = output.stridesOf()[0];
+            const Nd4jLong oStride1 = output.stridesOf()[1];
+            const Nd4jLong oStride2 = output.stridesOf()[2];
+            const Nd4jLong oStride3 = output.stridesOf()[3];
+
+            const Nd4jLong iStep2   = dH*iStride2;                     // buffer distance between two vertically adjacent kernel taps
+            const Nd4jLong iStep3   = dW*iStride3;                     // buffer distance between two horizontally adjacent kernel taps
+            const int kProd         = kH*kW;                           // divisor used by avg pooling when padding is included
+
+            if(poolingMode == 0) {        // max
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong hstart, wstart, hend, wend;
+                    T *pIn;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int oh = 0; oh < oH; ++oh) {
+                                for (int ow = 0; ow < oW; ++ow) {
+                                    // window [hstart, hend) x [wstart, wend) is built in element coordinates, clipped to the input in whole dilation steps, then scaled to buffer offsets
+                                    pIn = in + b * iStride0 + c * iStride1;
+
+                                    hstart = oh * sH - pH;
+                                    wstart = ow * sW - pW;
+                                    hend = hstart + kHEff;
+                                    wend = wstart + kWEff;
+
+                                    if (hstart < 0)
+                                        hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(-hstart) / static_cast<T>(dH));
+                                    if (wstart < 0)
+                                        wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(-wstart) / static_cast<T>(dW));
+                                    if (hend > iH)
+                                        hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(hend-iH) / static_cast<T>(dH));
+                                    if (wend > iW)
+                                        wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
+
+                                    hstart *= iStride2;
+                                    hend *= iStride2;
+                                    wstart *= iStride3;
+                                    wend *= iStride3;
+
+                                    T max = -DataTypeUtils::max<T>();       // running maximum; assumes DataTypeUtils::max<T>() is the largest finite T so any tap can replace the seed - confirm
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) {
+                                            T val = pIn[kh + kw];
+                                            if (val > max)
+                                                max = val;
+                                        }
+                                    out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = max;
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);      // x range covers batches [0, bS), y range covers channels [0, iC), both with step 1
+            }
+/*************************************************************************/
+            else if(poolingMode == 1) {      // avg
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong hstart, wstart, hend, wend;
+                    T *pIn;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int oh = 0; oh < oH; ++oh) {
+                                for (int ow = 0; ow < oW; ++ow) {
+                                    // window [hstart, hend) x [wstart, wend) is built in element coordinates, clipped to the input in whole dilation steps, then scaled to buffer offsets
+                                    pIn = in + b * iStride0 + c * iStride1;
+
+                                    hstart = oh * sH - pH;
+                                    wstart = ow * sW - pW;
+                                    hend = hstart + kHEff;
+                                    wend = wstart + kWEff;
+
+                                    if (hstart < 0)
+                                        hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(-hstart) / static_cast<T>(dH));
+                                    if (wstart < 0)
+                                        wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(-wstart) / static_cast<T>(dW));
+                                    if (hend > iH)
+                                        hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(hend-iH) / static_cast<T>(dH));
+                                    if (wend > iW)
+                                        wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
+
+                                    hstart *= iStride2;
+                                    hend *= iStride2;
+                                    wstart *= iStride3;
+                                    wend *= iStride3;
+
+                                    T sum = static_cast<T>(0.f);
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                            sum += pIn[kh + kw];
+
+                                    if (extraParam0 == 0) {                     //Exclude padding
+                                        int a = (hend - hstart) / iStep2 + ((hend - hstart) % iStep2 == 0 ? 0 : 1);     // rounded-up count of taps kept along H
+                                        int r = (wend - wstart) / iStep3 + ((wend - wstart) % iStep3 == 0 ? 0 : 1);     // rounded-up count of taps kept along W
+                                        sum /= static_cast<T>(a * r);          //  Accounts for dilation
+                                    } else if (extraParam0 == 1)                  //Include padding
+                                        sum /= kProd;
+
+                                    out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum;
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);      // x range covers batches [0, bS), y range covers channels [0, iC), both with step 1
+            }
+/*************************************************************************/
+            else if(poolingMode == 2) {  // pnorm
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong hstart, wstart, hend, wend;
+                    T *pIn;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int oh = 0; oh < oH; ++oh) {
+                                for (int ow = 0; ow < oW; ++ow) {
+                                    // window [hstart, hend) x [wstart, wend) is built in element coordinates, clipped to the input in whole dilation steps, then scaled to buffer offsets
+                                    pIn = in + b * iStride0 + c * iStride1;
+
+                                    hstart = oh * sH - pH;
+                                    wstart = ow * sW - pW;
+                                    hend = hstart + kHEff;
+                                    wend = wstart + kWEff;
+
+                                    if (hstart < 0)
+                                        hstart += dH * ((-hstart + dH - 1) / dH); // (Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(-hstart) / static_cast<T>(dH));
+                                    if (wstart < 0)
+                                        wstart += dW * ((-wstart + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(-wstart) / static_cast<T>(dW));
+                                    if (hend > iH)
+                                        hend -= dH * ((hend - iH + dH - 1) / dH); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(hend-iH) / static_cast<T>(dH));
+                                    if (wend > iW)
+                                        wend -= dW * ((wend - iW + dW - 1) / dW); //(Nd4jLong)sd::math::nd4j_ceil<T,T>(static_cast<T>(wend-iW) / static_cast<T>(dW));
+
+                                    hstart *= iStride2;
+                                    hend *= iStride2;
+                                    wstart *= iStride3;
+                                    wend *= iStride3;
+
+                                    T sum = static_cast<T>(0.f);
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                            sum += sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);      // accumulate |x|^p with p = extraParam0
+
+                                    sum = sd::math::nd4j_pow<T, T, T>(sum, static_cast<T>((T) 1.f) / extraParam0);                      // raise the accumulated sum to the power 1/p
+
+                                    out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum;
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);      // x range covers batches [0, bS), y range covers channels [0, iC), both with step 1
+            }
+            else {
+                nd4j_printf("ConvolutionUtils::pooling2d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode);
+                throw "";
+            }
+        }
+
+        void ConvolutionUtils::pooling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) {
+            BUILD_SINGLE_SELECTOR(input.dataType(), pooling2d_, (block, input, output, kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);  // public entry point: forwards every argument to pooling2d_ (poolingMode goes from PoolingType to its int parameter); the macro (defined elsewhere) presumably picks the template instantiation from input.dataType() among FLOAT_TYPES - confirm
+        }
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling2dBP.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling2dBP.cpp
new file mode 100644
index 000000000..03f34bfae
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling2dBP.cpp
@@ -0,0 +1,306 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+        template <typename T>
+        static void pooling2dBP_(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+            // Back-propagation of 2D pooling: adds each gradO element into the gradI cells of the pooling window it belongs to (max: only the arg-max cell).
+            // input [bS, iC, iH, iW]; gradI [bS, iC, iH, iW] -> gradI is output in this function; gradO [bS, iC, oH, oW]
+            // poolingMode: 0 = max, 1 = avg, 2 = pnorm; extraParam0: avg -> 0 excludes / 1 includes padding in the divisor, pnorm -> exponent p
+            // pnorm windows whose sum of |x|^p is 0 contribute a zero gradient; any other poolingMode throws std::runtime_error
+            // initial zeroing of gradI (every branch below accumulates into it with +=)
+            gradI.nullify();
+
+            T* in = const_cast<NDArray&>(input).bufferAsT<T>();
+            T* gO = const_cast<NDArray&>(gradO).bufferAsT<T>();
+            T* gI = gradI.bufferAsT<T>();
+
+            const int kHEff = kH + (kH-1)*(dH-1);    // kernel height once dilation gaps are counted
+            const int kWEff = kW + (kW-1)*(dW-1);    // kernel width once dilation gaps are counted
+
+            const int bS = gradI.sizeAt(0);
+            const int iC = gradI.sizeAt(1);
+            const int iH = gradI.sizeAt(2);
+            const int iW = gradI.sizeAt(3);
+            // gradO is indexed below with the same b / c indices as gradI, so its channel count is not read separately
+            const int oH = gradO.sizeAt(2);
+            const int oW = gradO.sizeAt(3);
+
+            nd4j_debug("MKL-DNN is not used for pooling2d_bp!\n", 0);
+
+            const Nd4jLong iStride0  = input.stridesOf()[0];
+            const Nd4jLong iStride1  = input.stridesOf()[1];
+            const Nd4jLong iStride2  = input.stridesOf()[2];
+            const Nd4jLong iStride3  = input.stridesOf()[3];
+            const Nd4jLong gIStride0 = gradI.stridesOf()[0];
+            const Nd4jLong gIStride1 = gradI.stridesOf()[1];
+            const Nd4jLong gIStride2 = gradI.stridesOf()[2];
+            const Nd4jLong gIStride3 = gradI.stridesOf()[3];
+            const Nd4jLong oStride0  = gradO.stridesOf()[0];
+            const Nd4jLong oStride1  = gradO.stridesOf()[1];
+            const Nd4jLong oStride2  = gradO.stridesOf()[2];
+            const Nd4jLong oStride3  = gradO.stridesOf()[3];
+            const Nd4jLong iStep2    = dH*iStride2;     // buffer offset between two dilated kernel rows of input
+            const Nd4jLong iStep3    = dW*iStride3;     // buffer offset between two dilated kernel columns of input
+            const Nd4jLong gIStep2   = dH*gIStride2;    // same, for gradI
+            const Nd4jLong gIStep3   = dW*gIStride3;
+            const int      kProd     = kH*kW;
+
+            const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3;
+            // when sameStrides holds, an offset computed for input addresses the matching gradI element as well
+            if(poolingMode == 0) {        // max
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong hstart, wstart,hend, wend, maxKH, maxKW;
+                    T sum, valO, *pIn;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int oh = 0; oh < oH; ++oh) {
+                                for (int ow = 0; ow < oW; ++ow) {
+
+                                    pIn = in + b * iStride0 + c * iStride1;
+
+                                    hstart = oh * sH - pH;
+                                    wstart = ow * sW - pW;
+                                    hend = hstart + kHEff;
+                                    wend = wstart + kWEff;
+                                    // clip the window to the input while staying on the dilation grid (integer ceil of the overshoot / dilation)
+                                    if (hstart < 0)
+                                        hstart += dH * ((-hstart + dH - 1) / dH);
+                                    if (wstart < 0)
+                                        wstart += dW * ((-wstart + dW - 1) / dW);
+                                    if (hend > iH)
+                                        hend -= dH * ((hend - iH + dH - 1) / dH);
+                                    if (wend > iW)
+                                        wend -= dW * ((wend - iW + dW - 1) / dW);
+
+                                    sum = -DataTypeUtils::max<T>();    // running maximum of the window
+                                    valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3];
+
+                                    if (sameStrides) {
+                                        // from here on the window bounds are buffer offsets rather than coordinates
+                                        hstart *= iStride2;
+                                        hend *= iStride2;
+                                        wstart *= iStride3;
+                                        wend *= iStride3;
+
+                                        // we set these to default values
+                                        maxKH = hstart;
+                                        maxKW = wstart;
+
+                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) {
+                                                T valIn = pIn[kh + kw];
+                                                if (valIn > sum) {
+                                                    sum = valIn;
+                                                    maxKH = kh;
+                                                    maxKW = kw;
+                                                }
+                                            }
+                                        gI[pIn - in + maxKH + maxKW] += valO;    // the whole gradient goes to the arg-max cell
+                                    } else {
+
+                                        // we set these to default values
+                                        maxKH = hstart;
+                                        maxKW = wstart;
+
+                                        for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                            for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                                T valIn = pIn[kh * iStride2 + kw * iStride3];
+                                                if (valIn > sum) {
+                                                    sum = valIn;
+                                                    maxKH = kh;
+                                                    maxKW = kw;
+                                                }
+                                            }
+
+                                        gI[b * gIStride0 + c * gIStride1 + maxKH * gIStride2 + maxKW * gIStride3] += valO;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
+            }
+/*************************************************************************/
+            else if(poolingMode == 1) {     // avg
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong hstart, wstart, hend, wend;
+                    T valO, *pgI;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int oh = 0; oh < oH; ++oh) {
+                                for (int ow = 0; ow < oW; ++ow) {
+
+                                    pgI = gI + b * gIStride0 + c * gIStride1;
+
+                                    hstart = oh * sH - pH;
+                                    wstart = ow * sW - pW;
+                                    hend = hstart + kHEff;
+                                    wend = wstart + kWEff;
+                                    // clip the window to the input while staying on the dilation grid
+                                    if (hstart < 0)
+                                        hstart += dH * ((-hstart + dH - 1) /
+                                                        dH);
+                                    if (wstart < 0)
+                                        wstart += dW * ((-wstart + dW - 1) /
+                                                        dW);
+                                    if (hend > iH)
+                                        hend -= dH * ((hend - iH + dH - 1) /
+                                                      dH);
+                                    if (wend > iW)
+                                        wend -= dW * ((wend - iW + dW - 1) /
+                                                      dW);
+
+                                    hstart *= gIStride2;
+                                    hend *= gIStride2;
+                                    wstart *= gIStride3;
+                                    wend *= gIStride3;
+
+                                    valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3];
+                                    // any extraParam0 other than 0 or 1 leaves valO undivided
+                                    if ((int) extraParam0 == 0)         //Exclude padding
+                                        valO /= static_cast<T>(sd::math::nd4j_ceil<double, T>(
+                                                static_cast<double>(hend - hstart) / static_cast<double>(gIStep2))) *
+                                                static_cast<T>(sd::math::nd4j_ceil<double, T>(
+                                                        static_cast<double>(wend - wstart) /
+                                                        static_cast<double>(gIStep3)));   //Accounts for dilation
+                                    else if ((int) extraParam0 == 1)    //Include padding
+                                        valO /= kProd;
+
+                                    for (Nd4jLong kh = hstart; kh < hend; kh += gIStep2)
+                                        for (Nd4jLong kw = wstart; kw < wend; kw += gIStep3)
+                                            pgI[kh + kw] += valO;
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
+            }
+/*************************************************************************/
+            else if(poolingMode == 2) {  // pnorm
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong hstart, wstart, hend, wend;
+                    T sum, valO, *pIn, *pgI;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int oh = 0; oh < oH; ++oh) {
+                                for (int ow = 0; ow < oW; ++ow) {
+
+                                    pIn = in + b * iStride0 + c * iStride1;
+                                    pgI = sameStrides ? gI + (pIn - in) : gI + b * gIStride0 + c * gIStride1;
+
+                                    hstart = oh * sH - pH;
+                                    wstart = ow * sW - pW;
+                                    hend = hstart + kHEff;
+                                    wend = wstart + kWEff;
+                                    // clip the window to the input while staying on the dilation grid
+                                    if (hstart < 0)
+                                        hstart += dH * ((-hstart + dH - 1) /
+                                                        dH);
+                                    if (wstart < 0)
+                                        wstart += dW * ((-wstart + dW - 1) /
+                                                        dW);
+                                    if (hend > iH)
+                                        hend -= dH * ((hend - iH + dH - 1) /
+                                                      dH);
+                                    if (wend > iW)
+                                        wend -= dW * ((wend - iW + dW - 1) /
+                                                      dW);
+
+                                    sum = static_cast<T>(0.f);    // accumulates |x|^p over the window
+                                    valO = gO[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3];
+
+                                    if (sameStrides) {
+
+                                        hstart *= iStride2;
+                                        hend *= iStride2;
+                                        wstart *= iStride3;
+                                        wend *= iStride3;
+
+                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                                sum += sd::math::nd4j_pow<T, T, T>(
+                                                        sd::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0);
+                                        // a zero sum would give pow(0, negative) = inf and then inf * 0 = NaN in gradI: propagate 0 for such windows
+                                        valO = (sum == static_cast<T>(0.f)) ? static_cast<T>(0.f) : static_cast<T>(valO * sd::math::nd4j_pow<T, T, T>(sum,
+                                                                              ((T) 1. - extraParam0) / extraParam0));
+
+                                        for (Nd4jLong kh = hstart; kh < hend; kh += iStep2)
+                                            for (Nd4jLong kw = wstart; kw < wend; kw += iStep3)
+                                                pgI[kh + kw] += valO * sd::math::nd4j_pow<T, T, T>(
+                                                        sd::math::nd4j_abs<T>(pIn[kh + kw]), extraParam0 - 1.f) *
+                                                                sd::math::nd4j_sgn<T, T>(pIn[kh + kw]);
+                                    } else {
+
+                                        for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                            for (Nd4jLong kw = wstart; kw < wend; kw += dW)
+                                                sum += sd::math::nd4j_pow<T, T, T>(
+                                                        sd::math::nd4j_abs<T>(pIn[kh * iStride2 + kw * iStride3]),
+                                                        extraParam0);
+                                        // a zero sum would give pow(0, negative) = inf and then inf * 0 = NaN in gradI: propagate 0 for such windows
+                                        valO = (sum == static_cast<T>(0.f)) ? static_cast<T>(0.f) : static_cast<T>(valO * sd::math::nd4j_pow<T, T, T>(sum,
+                                                                              ((T) 1. - extraParam0) / extraParam0));
+
+                                        for (Nd4jLong kh = hstart; kh < hend; kh += dH) {
+                                            for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                                const auto inVal = pIn[kh * iStride2 + kw * iStride3];
+                                                pgI[kh * gIStride2 + kw * gIStride3] += valO *
+                                                                                        sd::math::nd4j_pow<T, T, T>(
+                                                                                                sd::math::nd4j_abs<T>(
+                                                                                                        inVal),
+                                                                                                extraParam0 - 1.f) *
+                                                                                        sd::math::nd4j_sgn<T, T>(
+                                                                                                inVal);
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
+            }
+            else {
+                nd4j_printf("ConvolutionUtils::pooling2dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode);
+                throw std::runtime_error("Incorrect pooling2dBP mode");
+            }
+        }
+
+void ConvolutionUtils::pooling2dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) { // public entry point: forwards every argument unchanged to pooling2dBP_
+            BUILD_SINGLE_SELECTOR(input.dataType(), pooling2dBP_, (block, input, gradO, gradI, kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0), FLOAT_TYPES); // NOTE(review): macro is defined elsewhere - presumably instantiates pooling2dBP_<T> for the FLOAT_TYPES entry matching input.dataType()
+        }
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling3d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling3d.cpp
new file mode 100644
index 000000000..04d5f993a
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling3d.cpp
@@ -0,0 +1,261 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+        template <typename T>
+        static void pooling3d_(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+            // Forward 3D pooling: reduces every (dilated, strided, padded) kD x kH x kW window of input into one output element.
+            // input is [bS, iC, iD, iH, iW], output is [bS, iC, oD, oH, oW]
+            T* out = output.bufferAsT<T>();
+            T* in  = const_cast<NDArray&>(input).bufferAsT<T>();
+            // poolingMode: 0 = max, 1 = avg, 2 = pnorm (anything else throws std::runtime_error); extraParam0: avg -> 0 excludes / 1 includes padding in the divisor, pnorm -> exponent p
+            const int kDEff = kD + (kD-1)*(dD-1);    // kernel extents once dilation gaps are counted
+            const int kHEff = kH + (kH-1)*(dH-1);
+            const int kWEff = kW + (kW-1)*(dW-1);
+
+            const int bS = input.sizeAt(0);
+            const int iC = input.sizeAt(1);
+            const int iD = input.sizeAt(2);
+            const int iH = input.sizeAt(3);
+            const int iW = input.sizeAt(4);
+            // output is indexed below with the same b / c indices as input, so its channel count is not read separately
+            const int oD = output.sizeAt(2);
+            const int oH = output.sizeAt(3);
+            const int oW = output.sizeAt(4);
+
+            nd4j_debug("MKL-DNN is not used for pooling3d!\n", 0);
+
+            const Nd4jLong iStride0 = input.stridesOf()[0];
+            const Nd4jLong iStride1 = input.stridesOf()[1];
+            const Nd4jLong iStride2 = input.stridesOf()[2];
+            const Nd4jLong iStride3 = input.stridesOf()[3];
+            const Nd4jLong iStride4 = input.stridesOf()[4];
+            const Nd4jLong oStride0 = output.stridesOf()[0];
+            const Nd4jLong oStride1 = output.stridesOf()[1];
+            const Nd4jLong oStride2 = output.stridesOf()[2];
+            const Nd4jLong oStride3 = output.stridesOf()[3];
+            const Nd4jLong oStride4 = output.stridesOf()[4];
+            const Nd4jLong iStep2   = dD*iStride2;    // buffer offset between two dilated kernel taps along depth
+            const Nd4jLong iStep3   = dH*iStride3;    // ... along height
+            const Nd4jLong iStep4   = dW*iStride4;    // ... along width
+            const int kProd         = kD*kH*kW;
+
+            if(poolingMode == 0) {        // max
+                auto func = PRAGMA_THREADS_FOR_3D {
+                    Nd4jLong dstart, hstart, wstart, dend, hend, wend;
+                    T sum, *pIn;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int od = start_z; od < stop_z; od += inc_z) {
+                                for (int oh = 0; oh < oH; ++oh) {
+                                    for (int ow = 0; ow < oW; ++ow) {
+
+                                        pIn = in + b * iStride0 + c * iStride1;
+
+                                        dstart = od * sD - pD;
+                                        hstart = oh * sH - pH;
+                                        wstart = ow * sW - pW;
+                                        dend = dstart + kDEff;
+                                        hend = hstart + kHEff;
+                                        wend = wstart + kWEff;
+                                        // clip the window to the input while staying on the dilation grid (integer ceil of the overshoot / dilation)
+                                        if (dstart < 0)
+                                            dstart += dD * ((-dstart + dD - 1) / dD);
+                                        if (hstart < 0)
+                                            hstart += dH * ((-hstart + dH - 1) / dH);
+                                        if (wstart < 0)
+                                            wstart += dW * ((-wstart + dW - 1) / dW);
+                                        if (dend > iD)
+                                            dend -= dD * ((dend - iD + dD - 1) / dD);
+                                        if (hend > iH)
+                                            hend -= dH * ((hend - iH + dH - 1) / dH);
+                                        if (wend > iW)
+                                            wend -= dW * ((wend - iW + dW - 1) / dW);
+                                        // from here on the window bounds are buffer offsets rather than coordinates
+                                        dstart *= iStride2;
+                                        dend *= iStride2;
+                                        hstart *= iStride3;
+                                        hend *= iStride3;
+                                        wstart *= iStride4;
+                                        wend *= iStride4;
+
+                                        sum = -DataTypeUtils::max<T>();    // running maximum of the window
+
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
+                                                    T val = pIn[kd + kh + kw];
+                                                    if (val > sum)
+                                                        sum = val;
+                                                }
+
+                                        out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1);
+            }
+/*************************************************************************/
+            else if(poolingMode == 1) {     // avg
+                auto func = PRAGMA_THREADS_FOR_3D {
+                    Nd4jLong dstart, hstart, wstart, dend, hend, wend;
+                    T sum, *pIn;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int od = start_z; od < stop_z; od += inc_z) {
+                                for (int oh = 0; oh < oH; ++oh) {
+                                    for (int ow = 0; ow < oW; ++ow) {
+
+                                        pIn = in + b * iStride0 + c * iStride1;
+
+                                        dstart = od * sD - pD;
+                                        hstart = oh * sH - pH;
+                                        wstart = ow * sW - pW;
+                                        dend = dstart + kDEff;
+                                        hend = hstart + kHEff;
+                                        wend = wstart + kWEff;
+                                        // clip the window to the input while staying on the dilation grid
+                                        if (dstart < 0)
+                                            dstart += dD * ((-dstart + dD - 1) / dD);
+                                        if (hstart < 0)
+                                            hstart += dH * ((-hstart + dH - 1) / dH);
+                                        if (wstart < 0)
+                                            wstart += dW * ((-wstart + dW - 1) / dW);
+                                        if (dend > iD)
+                                            dend -= dD * ((dend - iD + dD - 1) / dD);
+                                        if (hend > iH)
+                                            hend -= dH * ((hend - iH + dH - 1) / dH);
+                                        if (wend > iW)
+                                            wend -= dW * ((wend - iW + dW - 1) / dW);
+
+                                        dstart *= iStride2;
+                                        dend *= iStride2;
+                                        hstart *= iStride3;
+                                        hend *= iStride3;
+                                        wstart *= iStride4;
+                                        wend *= iStride4;
+
+                                        sum = static_cast<T>(0.);
+
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                                    sum += pIn[kd + kh + kw];
+                                        // any extraParam0 other than 0 or 1 leaves the plain sum in place
+                                        if (extraParam0 == 0)         //Exclude padding
+                                            sum /= sd::math::nd4j_ceil<double, T>(static_cast<double>(dend - dstart) / static_cast<double>(iStep2)) * sd::math::nd4j_ceil<double, T>(static_cast<double>(hend - hstart) / static_cast<double>(iStep3)) * sd::math::nd4j_ceil<double, T>(static_cast<double>(wend - wstart) / static_cast<double>(iStep4));   //Accounts for dilation
+                                        else if (extraParam0 == 1)    //Include padding
+                                            sum /= kProd;
+
+                                        out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1);
+            }
+/*************************************************************************/
+            else if(poolingMode == 2) {  // pnorm
+                auto func = PRAGMA_THREADS_FOR_3D {
+                    Nd4jLong dstart, hstart, wstart, dend, hend, wend;
+                    T sum, *pIn;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int od = start_z; od < stop_z; od += inc_z) {
+                                for (int oh = 0; oh < oH; ++oh) {
+                                    for (int ow = 0; ow < oW; ++ow) {
+
+                                        pIn = in + b * iStride0 + c * iStride1;
+
+                                        dstart = od * sD - pD;
+                                        hstart = oh * sH - pH;
+                                        wstart = ow * sW - pW;
+                                        dend = dstart + kDEff;
+                                        hend = hstart + kHEff;
+                                        wend = wstart + kWEff;
+                                        // clip the window to the input while staying on the dilation grid
+                                        if (dstart < 0)
+                                            dstart += dD * ((-dstart + dD - 1) / dD);
+                                        if (hstart < 0)
+                                            hstart += dH * ((-hstart + dH - 1) / dH);
+                                        if (wstart < 0)
+                                            wstart += dW * ((-wstart + dW - 1) / dW);
+                                        if (dend > iD)
+                                            dend -= dD * ((dend - iD + dD - 1) / dD);
+                                        if (hend > iH)
+                                            hend -= dH * ((hend - iH + dH - 1) / dH);
+                                        if (wend > iW)
+                                            wend -= dW * ((wend - iW + dW - 1) / dW);
+
+                                        dstart *= iStride2;
+                                        dend *= iStride2;
+                                        hstart *= iStride3;
+                                        hend *= iStride3;
+                                        wstart *= iStride4;
+                                        wend *= iStride4;
+
+                                        sum = static_cast<T>(0.);
+                                        // accumulate |x|^p over the window, then take the p-th root
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                                    sum += sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0);
+
+                                        sum = sd::math::nd4j_pow<T, T, T>(sum, (T) 1.f / extraParam0);
+
+                                        out[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4] = sum;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1);
+            }
+            else {
+                nd4j_printf("ConvolutionUtils::pooling3d: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode);
+                throw std::runtime_error("Incorrect pooling3d mode");
+            }
+        }
+
+void ConvolutionUtils::pooling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) { // public entry point: forwards every argument unchanged to pooling3d_
+            BUILD_SINGLE_SELECTOR(input.dataType(), pooling3d_, (block, input, output, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES); // NOTE(review): macro is defined elsewhere - presumably instantiates pooling3d_<T> for the FLOAT_TYPES entry matching input.dataType()
+        }
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling3dBP.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling3dBP.cpp
new file mode 100644
index 000000000..02f6f57ac
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_pooling3dBP.cpp
@@ -0,0 +1,326 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+#include <stdexcept>
+
+namespace sd {
+    namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+        // Backward pass of 3D pooling: spreads every gradO element over the gradI
+        // elements of the pooling window it belongs to.
+        //
+        //   block       - graph context; not referenced by this implementation
+        //   input       - [bS, iC, iD, iH, iW] forward-pass input
+        //   gradO       - [bS, iC, oD, oH, oW] gradient w.r.t. the pooling output
+        //   gradI       - [bS, iC, iD, iH, iW] result; zeroed first, then accumulated into
+        //   kD,kH,kW    - kernel sizes;   sD,sH,sW - strides
+        //   pD,pH,pW    - paddings;       dD,dH,dW - dilations
+        //   poolingMode - 0: max, 1: avg, 2: pnorm
+        //   extraParam0 - avg:   0 divides by the number of in-bounds window elements,
+        //                        1 divides by kD*kH*kW, any other value leaves gradO unscaled;
+        //                 pnorm: the exponent p
+        //
+        // Work is split over (batch, channel) pairs via samediff::Threads::parallel_for.
+        // Throws std::runtime_error when poolingMode is not 0, 1 or 2.
+        template <typename T>
+        static void pooling3dBP_(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+            // input [bS, iC, iD, iH, iW]
+            // gradI [bS, iC, iD, iH, iW] -> gradI is output in this function
+            // gradO [bS, iC, oD, oH, oW]
+
+            // initial zeroing of gradI
+            gradI.nullify();
+
+            T* in = const_cast<NDArray&>(input).bufferAsT<T>();
+            T* gO = const_cast<NDArray&>(gradO).bufferAsT<T>();
+            T* gI = gradI.bufferAsT<T>();
+
+            // kernel extent along each axis once dilation gaps are included
+            const int kDEff = kD + (kD-1)*(dD-1);
+            const int kHEff = kH + (kH-1)*(dH-1);
+            const int kWEff = kW + (kW-1)*(dW-1);
+
+            const int bS = gradI.sizeAt(0);
+            const int iC = gradI.sizeAt(1);
+            const int iD = gradI.sizeAt(2);
+            const int iH = gradI.sizeAt(3);
+            const int iW = gradI.sizeAt(4);
+            const int oD = gradO.sizeAt(2);
+            const int oH = gradO.sizeAt(3);
+            const int oW = gradO.sizeAt(4);
+
+            nd4j_debug("MKL-DNN is not used for pooling3d_bp!\n", 0);
+
+            const Nd4jLong iStride0  = input.stridesOf()[0];
+            const Nd4jLong iStride1  = input.stridesOf()[1];
+            const Nd4jLong iStride2  = input.stridesOf()[2];
+            const Nd4jLong iStride3  = input.stridesOf()[3];
+            const Nd4jLong iStride4  = input.stridesOf()[4];
+            const Nd4jLong gIStride0 = gradI.stridesOf()[0];
+            const Nd4jLong gIStride1 = gradI.stridesOf()[1];
+            const Nd4jLong gIStride2 = gradI.stridesOf()[2];
+            const Nd4jLong gIStride3 = gradI.stridesOf()[3];
+            const Nd4jLong gIStride4 = gradI.stridesOf()[4];
+            const Nd4jLong oStride0 = gradO.stridesOf()[0];
+            const Nd4jLong oStride1 = gradO.stridesOf()[1];
+            const Nd4jLong oStride2 = gradO.stridesOf()[2];
+            const Nd4jLong oStride3 = gradO.stridesOf()[3];
+            const Nd4jLong oStride4 = gradO.stridesOf()[4];
+            // buffer-offset increments corresponding to one dilation step along D/H/W
+            const Nd4jLong iStep2   = dD*iStride2;
+            const Nd4jLong iStep3   = dH*iStride3;
+            const Nd4jLong iStep4   = dW*iStride4;
+            const Nd4jLong gIStep2  = dD*gIStride2;
+            const Nd4jLong gIStep3  = dH*gIStride3;
+            const Nd4jLong gIStep4  = dW*gIStride4;
+            const int      kProd    = kD*kH*kW;
+
+            // when input and gradI share all strides, one set of buffer offsets addresses both arrays
+            const bool sameStrides = iStride0 == gIStride0 && iStride1 == gIStride1 && iStride2 == gIStride2 && iStride3 == gIStride3 && iStride4 == gIStride4;
+
+            if(poolingMode == 0) {        // max
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong dstart, hstart, wstart, dend, hend, wend, maxKD, maxKH, maxKW;
+                    T sum, valO, *pIn;
+
+                    for (int b = start_x; b < stop_x; b++) {
+                        for (int c = start_y; c < stop_y; c++) {
+                            for (int od = 0; od < oD; od++) {
+                                for (int oh = 0; oh < oH; ++oh) {
+                                    for (int ow = 0; ow < oW; ++ow) {
+
+                                        pIn = in + b * iStride0 + c * iStride1;
+
+                                        dstart = od * sD - pD;
+                                        hstart = oh * sH - pH;
+                                        wstart = ow * sW - pW;
+                                        dend = dstart + kDEff;
+                                        hend = hstart + kHEff;
+                                        wend = wstart + kWEff;
+
+                                        // move a negative window start forward by whole dilation steps
+                                        if (dstart < 0)
+                                            dstart += dD * ((-dstart + dD - 1) / dD);
+                                        if (hstart < 0)
+                                            hstart += dH * ((-hstart + dH - 1) / dH);
+                                        if (wstart < 0)
+                                            wstart += dW * ((-wstart + dW - 1) / dW);
+                                        // pull a window end that exceeds the input size back by whole dilation steps
+                                        if (dend > iD)
+                                            dend -= dD * ((dend - iD + dD - 1) / dD);
+                                        if (hend > iH)
+                                            hend -= dH * ((hend - iH + dH - 1) / dH);
+                                        if (wend > iW)
+                                            wend -= dW * ((wend - iW + dW - 1) / dW);
+
+                                        sum = -DataTypeUtils::max<T>();
+                                        valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4];
+
+                                        if (sameStrides) {
+
+                                            // convert window bounds from element indexes to buffer offsets
+                                            dstart *= iStride2;
+                                            dend *= iStride2;
+                                            hstart *= iStride3;
+                                            hend *= iStride3;
+                                            wstart *= iStride4;
+                                            wend *= iStride4;
+
+                                            maxKD = dstart;
+                                            maxKH = hstart;
+                                            maxKW = wstart;
+
+                                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4) {
+                                                        T valIn = pIn[kd + kh + kw];
+                                                        if (valIn > sum) {
+                                                            sum = valIn;
+                                                            maxKD = kd;
+                                                            maxKH = kh;
+                                                            maxKW = kw;
+                                                        }
+                                                    }
+                                            // the whole gradient goes to the position of the window maximum
+                                            gI[pIn - in + maxKD + maxKH + maxKW] += valO;
+                                        } else {
+
+                                            // we set these to default values
+                                            maxKH = hstart;
+                                            maxKW = wstart;
+                                            maxKD = dstart;
+
+                                            for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                                for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                                    for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                                        T valIn = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4];
+                                                        if (valIn > sum) {
+                                                            sum = valIn;
+                                                            maxKD = kd;
+                                                            maxKH = kh;
+                                                            maxKW = kw;
+                                                        }
+                                                    }
+
+                                            gI[b * gIStride0 + c * gIStride1 + maxKD * gIStride2 + maxKH * gIStride3 + maxKW * gIStride4] += valO;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
+            }
+/*************************************************************************/
+            else if(poolingMode == 1) {     // avg
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong dstart, hstart, wstart, dend, hend, wend;
+                    T valO, *pgI;
+
+                    for (int b = start_x; b < stop_x; b++) {
+                        for (int c = start_y; c < stop_y; c++) {
+                            for (int od = 0; od < oD; od++) {
+                                for (int oh = 0; oh < oH; ++oh) {
+                                    for (int ow = 0; ow < oW; ++ow) {
+
+                                        pgI = gI + b * gIStride0 + c * gIStride1;
+
+                                        dstart = od * sD - pD;
+                                        hstart = oh * sH - pH;
+                                        wstart = ow * sW - pW;
+                                        dend = dstart + kDEff;
+                                        hend = hstart + kHEff;
+                                        wend = wstart + kWEff;
+
+                                        // move a negative window start forward by whole dilation steps
+                                        if (dstart < 0)
+                                            dstart += dD * ((-dstart + dD - 1) / dD);
+                                        if (hstart < 0)
+                                            hstart += dH * ((-hstart + dH - 1) / dH);
+                                        if (wstart < 0)
+                                            wstart += dW * ((-wstart + dW - 1) / dW);
+                                        // pull a window end that exceeds the input size back by whole dilation steps
+                                        if (dend > iD)
+                                            dend -= dD * ((dend - iD + dD - 1) / dD);
+                                        if (hend > iH)
+                                            hend -= dH * ((hend - iH + dH - 1) / dH);
+                                        if (wend > iW)
+                                            wend -= dW * ((wend - iW + dW - 1) / dW);
+
+                                        // convert window bounds from element indexes to gradI buffer offsets
+                                        dstart *= gIStride2;
+                                        dend *= gIStride2;
+                                        hstart *= gIStride3;
+                                        hend *= gIStride3;
+                                        wstart *= gIStride4;
+                                        wend *= gIStride4;
+
+                                        valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4];
+
+                                        if (extraParam0 == 0)         //Exclude padding
+                                            valO /= sd::math::nd4j_ceil<double, T>(static_cast<double>(dend - dstart) / static_cast<double>(gIStep2)) * sd::math::nd4j_ceil<double, T>(static_cast<double>(hend - hstart) / static_cast<double>(gIStep3)) * sd::math::nd4j_ceil<double, T>(static_cast<double>(wend - wstart) / static_cast<double>(gIStep4));   //Accounts for dilation
+                                        else if (extraParam0 == 1)    //Include padding
+                                            valO /= kProd;
+
+                                        // every in-bounds window element receives the same share
+                                        for (Nd4jLong kd = dstart; kd < dend; kd += gIStep2)
+                                            for (Nd4jLong kh = hstart; kh < hend; kh += gIStep3)
+                                                for (Nd4jLong kw = wstart; kw < wend; kw += gIStep4)
+                                                    pgI[kd + kh + kw] += valO;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
+            }
+/*************************************************************************/
+            else if(poolingMode == 2) {  // pnorm
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    Nd4jLong dstart, hstart, wstart, dend, hend, wend;
+                    T sum, valO, *pIn, *pgI;
+
+                    for (int b = start_x; b < stop_x; b++) {
+                        for (int c = start_y; c < stop_y; c++) {
+                            for (int od = 0; od < oD; od++) {
+                                for (int oh = 0; oh < oH; ++oh) {
+                                    for (int ow = 0; ow < oW; ++ow) {
+
+                                        pIn = in + b * iStride0 + c * iStride1;
+                                        // gradI slice is addressed with gradI's own batch/channel strides, so the
+                                        // base pointer is also correct when input and gradI strides differ
+                                        pgI = gI + b * gIStride0 + c * gIStride1;
+
+                                        dstart = od * sD - pD;
+                                        hstart = oh * sH - pH;
+                                        wstart = ow * sW - pW;
+                                        dend = dstart + kDEff;
+                                        hend = hstart + kHEff;
+                                        wend = wstart + kWEff;
+
+                                        // move a negative window start forward by whole dilation steps
+                                        if (dstart < 0)
+                                            dstart += dD * ((-dstart + dD - 1) / dD);
+                                        if (hstart < 0)
+                                            hstart += dH * ((-hstart + dH - 1) / dH);
+                                        if (wstart < 0)
+                                            wstart += dW * ((-wstart + dW - 1) / dW);
+                                        // pull a window end that exceeds the input size back by whole dilation steps
+                                        if (dend > iD)
+                                            dend -= dD * ((dend - iD + dD - 1) / dD);
+                                        if (hend > iH)
+                                            hend -= dH * ((hend - iH + dH - 1) / dH);
+                                        if (wend > iW)
+                                            wend -= dW * ((wend - iW + dW - 1) / dW);
+
+                                        sum = static_cast<T>(0.);
+                                        valO = gO[b * oStride0 + c * oStride1 + od * oStride2 + oh * oStride3 + ow * oStride4];
+
+                                        if (sameStrides) {
+
+                                            // convert window bounds from element indexes to buffer offsets
+                                            dstart *= iStride2;
+                                            dend *= iStride2;
+                                            hstart *= iStride3;
+                                            hend *= iStride3;
+                                            wstart *= iStride4;
+                                            wend *= iStride4;
+
+                                            // first pass: sum of |x|^p over the window
+                                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                                        sum += sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kd + kh + kw]), extraParam0);
+
+                                            valO *= sd::math::nd4j_pow<T, T, T>(sum, ((T) 1.f - extraParam0) / extraParam0);
+
+                                            // second pass: scale by |x|^(p-1) * sgn(x) for each window element
+                                            for (Nd4jLong kd = dstart; kd < dend; kd += iStep2)
+                                                for (Nd4jLong kh = hstart; kh < hend; kh += iStep3)
+                                                    for (Nd4jLong kw = wstart; kw < wend; kw += iStep4)
+                                                        pgI[kd + kh + kw] += valO * sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kd + kh + kw]),extraParam0 - (T) 1.f) * sd::math::nd4j_sgn<T, T>(pIn[kd + kh + kw]);
+                                        } else {
+                                            // first pass: sum of |x|^p over the window
+                                            for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                                for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                                    for (Nd4jLong kw = wstart; kw < wend; kw += dW)
+                                                        sum += sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(pIn[kd * iStride2 + kh * iStride3 + kw * iStride4]), extraParam0);
+
+                                            valO *= sd::math::nd4j_pow<T, T, T>(sum, ((T) 1.f - extraParam0) / extraParam0);
+
+                                            // second pass: scale by |x|^(p-1) * sgn(x) for each window element
+                                            for (Nd4jLong kd = dstart; kd < dend; kd += dD)
+                                                for (Nd4jLong kh = hstart; kh < hend; kh += dH)
+                                                    for (Nd4jLong kw = wstart; kw < wend; kw += dW) {
+                                                        // index with the loop variable kd (not the kernel size kD)
+                                                        const auto inVal = pIn[kd * iStride2 + kh * iStride3 + kw * iStride4];
+                                                        pgI[kd * gIStride2 + kh * gIStride3 + kw * gIStride4] += valO * sd::math::nd4j_pow<T, T, T>(sd::math::nd4j_abs<T>(inVal), extraParam0 - 1.f) * sd::math::nd4j_sgn<T, T>(inVal);
+                                                    }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1);
+            }
+            else {
+                nd4j_printf("ConvolutionUtils::pooling3dBP: pooling mode argument can take three values only: 0, 1, 2, but got %i instead !\n", poolingMode);
+                // throw a std::exception-derived type so that generic handlers can catch it
+                throw std::runtime_error("Incorrect pooling3dBP mode");
+            }
+        }
+
+       // Public entry point for the 3D pooling backward pass: forwards all arguments
+       // unchanged to the pooling3dBP_ template through BUILD_SINGLE_SELECTOR, keyed on
+       // input.dataType() and limited to FLOAT_TYPES.
+       // NOTE(review): handling of a non-float input type is decided by the macro, which is not visible here.
+       void ConvolutionUtils::pooling3dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+            BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dBP_, (block, input, gradO, gradI, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);
+        }
+    }
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_sconv2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_sconv2d.cpp
new file mode 100644
index 000000000..742f88c3b
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_sconv2d.cpp
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+// Separable 2D convolution: a depthwise convolution, followed by a 1x1 pointwise
+// convolution when weightsPoint is supplied.
+// Without weightsPoint the depthwise result (with bias applied) is written straight
+// into output. With weightsPoint a temporary array holds the depthwise result; it is
+// released on every exit path, including when one of the convolutions throws.
+template <typename X, typename Y>
+static void sconv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+
+            // input         [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
+            // weightsDepth  [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+            // weightsPoint  [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]
+            // bias          [oC], oC = iC*mC if weightsPoint=nullptr
+            // output is     [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW]  (NCHW)
+
+            //  kH         filter(kernel) height
+            //  kW         filter(kernel) width
+            //  sH         strides height
+            //  sW         strides width
+            //  pH         paddings height
+            //  pW         paddings width
+            //  dH         dilations height
+            //  dW         dilations width
+            //  paddingMode 0-VALID, 1-SAME
+            //  isNCHW      1-NCHW,  0-NHWC
+
+            int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier, output channels, output height/width
+            int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
+            ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+            mC = weightsDepth->sizeAt(indWmC);                      // channels multiplier
+
+            const bool hasPointwise = weightsPoint != nullptr;
+
+            NDArray* outputDepth = output;
+            if(hasPointwise)                        // if pointwise convolution is expected
+                outputDepth = new NDArray(output->ordering(), !isNCHW ? std::vector<Nd4jLong>({bS, oH, oW, iC*mC}) : std::vector<Nd4jLong>({bS, iC*mC, oH, oW}), input->dataType(), input->getContext());
+
+            try {
+                // ----- perform depthwise convolution (if weightsPoint is absent then oC = iC*mC) ----- //
+                ConvolutionUtils::depthwiseConv2d(block, input, weightsDepth, hasPointwise ? nullptr : bias, outputDepth, kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, isNCHW, wFormat);
+
+                // ----- perform pointwise convolution (oH = iH, oW = iW) ----- //
+                if (hasPointwise)
+                    ConvolutionUtils::conv2d(block, outputDepth, weightsPoint, bias, output, 1,1, 1,1, 0,0, 1,1, paddingMode, isNCHW, wFormat);             // in this case oH=iH, oW=iW
+            }
+            catch (...) {
+                // do not leak the temporary depthwise buffer when a convolution fails
+                if (hasPointwise)
+                    delete outputDepth;
+                throw;
+            }
+
+            if (hasPointwise)
+                delete outputDepth;
+        }
+
+// Public entry point for separable 2D convolution: forwards all arguments unchanged
+// to the sconv2d_ template through BUILD_SINGLE_SELECTOR_TWICE, keyed on
+// input->dataType() and limited to FLOAT_TYPES.
+// NOTE(review): presumably the macro instantiates sconv2d_ with the same type for both
+// template parameters - confirm against the macro definition.
+void ConvolutionUtils::sconv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+            BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), sconv2d_, (block, input, weightsDepth, weightsPoint, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);
+        }
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling2d.cpp
new file mode 100644
index 000000000..ffdd5c34b
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling2d.cpp
@@ -0,0 +1,80 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void upsampling2d_(const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) {
+            // Fills output so that element (h, w) is a copy of input element (h / factorH, w / factorW).
+            // input  : [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
+            // output : [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
+
+            const T* src = input.bufferAsT<T>();
+                  T* dst = output.bufferAsT<T>();
+
+            // axis positions of height and channels for the chosen layout; width directly follows height
+            const uint hAxis = isNCHW ? 2 : 1;
+            const uint cAxis = isNCHW ? 1 : 3;
+
+            const uint numBatches  = input.sizeAt(0);
+            const uint numChannels = input.sizeAt(cAxis);
+            const uint outHeight   = output.sizeAt(hAxis);
+            const uint outWidth    = output.sizeAt(hAxis + 1);
+
+            // strides in the logical order batch, channel, height, width
+            const Nd4jLong srcStrideB = input.stridesOf()[0];
+            const Nd4jLong srcStrideC = input.stridesOf()[cAxis];
+            const Nd4jLong srcStrideH = input.stridesOf()[hAxis];
+            const Nd4jLong srcStrideW = input.stridesOf()[hAxis + 1];
+
+            const Nd4jLong dstStrideB = output.stridesOf()[0];
+            const Nd4jLong dstStrideC = output.stridesOf()[cAxis];
+            const Nd4jLong dstStrideH = output.stridesOf()[hAxis];
+            const Nd4jLong dstStrideW = output.stridesOf()[hAxis + 1];
+
+            // each task handles a (batch, channel, output row) range and walks all output columns
+            auto copyRows = PRAGMA_THREADS_FOR_3D {
+                for (uint b = start_x; b < stop_x; b += inc_x) {
+                    for (uint c = start_y; c < stop_y; c += inc_y) {
+                        for (uint h = start_z; h < stop_z; h += inc_z) {
+                            for (uint w = 0; w < outWidth; ++w) {
+                                // source coordinates obtained by integer division by the factors
+                                const uint srcH = h / factorH;
+                                const uint srcW = w / factorW;
+
+                                const auto dstOffset = b * dstStrideB + c * dstStrideC + h * dstStrideH + w * dstStrideW;
+                                const auto srcOffset = b * srcStrideB + c * srcStrideC + srcH * srcStrideH + srcW * srcStrideW;
+
+                                dst[dstOffset] = src[srcOffset];
+                            }
+                        }
+                    }
+                }
+            };
+
+            samediff::Threads::parallel_for(copyRows, 0, numBatches, 1, 0, numChannels, 1, 0, outHeight, 1);
+        }
+
+
+// Public entry point for 2D upsampling: forwards input, output, the two factors and
+// the layout flag to the upsampling2d_ template through BUILD_SINGLE_SELECTOR, keyed on
+// input.dataType() and limited to FLOAT_TYPES. The block argument is not forwarded.
+void ConvolutionUtils::upsampling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) {
+     BUILD_SINGLE_SELECTOR(input.dataType(), upsampling2d_, (input, output, factorH, factorW, isNCHW), FLOAT_TYPES);
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling2dBP.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling2dBP.cpp
new file mode 100644
index 000000000..aba46aabc
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling2dBP.cpp
@@ -0,0 +1,86 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void upsampling2dBP_(const NDArray& gradO, NDArray& gradI, const bool isNCHW) {
+            // Backward pass of 2D upsampling: every gradI element becomes the sum of the
+            // factorH x factorW patch of gradO elements that starts at (h * factorH, w * factorW).
+            // gradO : [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
+            // gradI : [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
+
+            const T* src = gradO.bufferAsT<T>();
+                  T* dst = gradI.bufferAsT<T>();
+
+            // axis positions of height and channels for the chosen layout; width directly follows height
+            const uint hAxis = isNCHW ? 2 : 1;
+            const uint cAxis = isNCHW ? 1 : 3;
+
+            const uint numBatches  = gradI.sizeAt(0);
+            const uint numChannels = gradI.sizeAt(cAxis);
+            const uint inHeight    = gradI.sizeAt(hAxis);
+            const uint inWidth     = gradI.sizeAt(hAxis + 1);
+
+            // upsampling factors are recovered from the ratio of the two shapes
+            const uint factorH = gradO.sizeAt(hAxis)     / inHeight;
+            const uint factorW = gradO.sizeAt(hAxis + 1) / inWidth;
+
+            // strides in the logical order batch, channel, height, width
+            const Nd4jLong srcStrideB = gradO.stridesOf()[0];
+            const Nd4jLong srcStrideC = gradO.stridesOf()[cAxis];
+            const Nd4jLong srcStrideH = gradO.stridesOf()[hAxis];
+            const Nd4jLong srcStrideW = gradO.stridesOf()[hAxis + 1];
+
+            const Nd4jLong dstStrideB = gradI.stridesOf()[0];
+            const Nd4jLong dstStrideC = gradI.stridesOf()[cAxis];
+            const Nd4jLong dstStrideH = gradI.stridesOf()[hAxis];
+            const Nd4jLong dstStrideW = gradI.stridesOf()[hAxis + 1];
+
+            // each task handles a (batch, channel, gradI row) range and walks all gradI columns
+            auto reducePatches = PRAGMA_THREADS_FOR_3D {
+                for (uint b = start_x; b < stop_x; b += inc_x) {
+                    for (uint c = start_y; c < stop_y; c += inc_y) {
+                        for (uint h = start_z; h < stop_z; h += inc_z) {
+                            for (uint w = 0; w < inWidth; ++w) {
+
+                                // accumulate directly in the destination element
+                                T& acc = dst[b * dstStrideB + c * dstStrideC + h * dstStrideH + w * dstStrideW];
+                                acc = 0;
+
+                                // bounds of the gradO patch that maps onto (h, w)
+                                const uint hBegin = h * factorH;
+                                const uint hEnd   = hBegin + factorH;
+                                const uint wBegin = w * factorW;
+                                const uint wEnd   = wBegin + factorW;
+
+                                for (uint xh = hBegin; xh < hEnd; ++xh) {
+                                    for (uint xw = wBegin; xw < wEnd; ++xw) {
+                                        acc += src[b * srcStrideB + c * srcStrideC + xh * srcStrideH + xw * srcStrideW];
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            };
+
+            samediff::Threads::parallel_for(reducePatches, 0, numBatches, 1, 0, numChannels, 1, 0, inHeight, 1);
+        }
+
+// Public entry point for the 2D upsampling backward pass: forwards gradO, gradI and
+// the layout flag to the upsampling2dBP_ template through BUILD_SINGLE_SELECTOR, keyed on
+// gradO.dataType() and limited to FLOAT_TYPES. The block argument is not forwarded.
+void ConvolutionUtils::upsampling2dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) {
+            BUILD_SINGLE_SELECTOR(gradO.dataType(), upsampling2dBP_, (gradO, gradI, isNCHW), FLOAT_TYPES);
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling3d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling3d.cpp
new file mode 100644
index 000000000..7b86ec5a1
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling3d.cpp
@@ -0,0 +1,89 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void upsampling3d_(const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
+            // input : [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
+            // output: [bS, iC, factorD*iD, factorH*iH, factorW*iW] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
+            // every output element (d, h, w) is a copy of the input element (d / factorD, h / factorH, w / factorW)
+            const T* src = input.bufferAsT<T>();
+                  T* dst = output.bufferAsT<T>();
+
+            const uint depthAxis   = isNCDHW ? 2 : 1;
+            const uint channelAxis = isNCDHW ? 1 : 4;
+
+            const uint bS = input.sizeAt(0);
+            const uint iC = input.sizeAt(channelAxis);
+            const uint oD = output.sizeAt(depthAxis);
+            const uint oH = output.sizeAt(depthAxis + 1);
+            const uint oW = output.sizeAt(depthAxis + 2);
+
+            const Nd4jLong srcStrideB = input.stridesOf()[0];
+            const Nd4jLong srcStrideC = input.stridesOf()[channelAxis];
+            const Nd4jLong srcStrideD = input.stridesOf()[depthAxis];
+            const Nd4jLong srcStrideH = input.stridesOf()[depthAxis + 1];
+            const Nd4jLong srcStrideW = input.stridesOf()[depthAxis + 2];
+
+            const Nd4jLong dstStrideB = output.stridesOf()[0];
+            const Nd4jLong dstStrideC = output.stridesOf()[channelAxis];
+            const Nd4jLong dstStrideD = output.stridesOf()[depthAxis];
+            const Nd4jLong dstStrideH = output.stridesOf()[depthAxis + 1];
+            const Nd4jLong dstStrideW = output.stridesOf()[depthAxis + 2];
+
+            // func receives sub-ranges of [0,bS) x [0,iC) x [0,oD) from parallel_for below; output height and width are looped in full
+            auto func = PRAGMA_THREADS_FOR_3D {
+                for (uint b = start_x; b < stop_x; b += inc_x) {
+                    for (uint c = start_y; c < stop_y; c += inc_y) {
+                        // offset terms that depend only on batch and channel are computed once per (b, c)
+                        const Nd4jLong srcBase = b * srcStrideB + c * srcStrideC;
+                        const Nd4jLong dstBase = b * dstStrideB + c * dstStrideC;
+
+                        for (uint d = start_z; d < stop_z; d += inc_z) {
+                            const Nd4jLong srcPlane = srcBase + (d / factorD) * srcStrideD;
+                            const Nd4jLong dstPlane = dstBase + d * dstStrideD;
+
+                            for (uint h = 0; h < oH; ++h) {
+                                const Nd4jLong srcRow = srcPlane + (h / factorH) * srcStrideH;
+                                const Nd4jLong dstRow = dstPlane + h * dstStrideH;
+
+                                for (uint w = 0; w < oW; ++w)
+                                    dst[dstRow + w * dstStrideW] = src[srcRow + (w / factorW) * srcStrideW];
+                            }
+                        }
+                    }
+                }
+            };
+
+            samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, oD, 1);
+        }
+
+       void ConvolutionUtils::upsampling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
+            BUILD_SINGLE_SELECTOR(input.dataType(), upsampling3d_, (input, output, factorD, factorH, factorW, isNCDHW), FLOAT_TYPES);   // presumably runs upsampling3d_<T> for the T matching input.dataType() among FLOAT_TYPES (confirm in the macro definition); `block` is not used in this definition
+        }
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling3dBP.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling3dBP.cpp
new file mode 100644
index 000000000..93c2746fb
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_upsampling3dBP.cpp
@@ -0,0 +1,95 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void upsampling3dBP_(const NDArray& gradO, NDArray& gradI, const bool isNCDHW) {
+            // each gradI element (d, h, w) receives the sum of the factorD*factorH*factorW block of gradO elements starting at (d*factorD, h*factorH, w*factorW)
+            // gradI has shape [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
+            // gradO has shape [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
+
+            const T* x = gradO.bufferAsT<T>();
+                  T* z = gradI.bufferAsT<T>();
+
+            const uint dimID = isNCDHW ? 2 : 1;
+            const uint dimIC = isNCDHW ? 1 : 4;
+
+            const uint bS = gradI.sizeAt(0);
+            const uint iC = gradI.sizeAt(dimIC);
+            const uint iD = gradI.sizeAt(dimID);
+            const uint iH = gradI.sizeAt(dimID + 1);
+            const uint iW = gradI.sizeAt(dimID + 2);
+            // factors are recovered from the two shapes by integer division (gradO spatial sizes are presumably exact multiples of gradI's -- confirm at call site)
+            const uint factorD = gradO.sizeAt(dimID)     / iD;
+            const uint factorH = gradO.sizeAt(dimID + 1) / iH;
+            const uint factorW = gradO.sizeAt(dimID + 2) / iW;
+
+            const Nd4jLong xStride0 = gradO.stridesOf()[0];
+            const Nd4jLong xStride1 = gradO.stridesOf()[dimIC];
+            const Nd4jLong xStride2 = gradO.stridesOf()[dimID];
+            const Nd4jLong xStride3 = gradO.stridesOf()[dimID + 1];
+            const Nd4jLong xStride4 = gradO.stridesOf()[dimID + 2];
+
+            const Nd4jLong zStride0 = gradI.stridesOf()[0];
+            const Nd4jLong zStride1 = gradI.stridesOf()[dimIC];
+            const Nd4jLong zStride2 = gradI.stridesOf()[dimID];
+            const Nd4jLong zStride3 = gradI.stridesOf()[dimID + 1];
+            const Nd4jLong zStride4 = gradI.stridesOf()[dimID + 2];
+
+            // loop through gradI: func receives sub-ranges of [0,bS) x [0,iC) x [0,iD) from parallel_for below; h and w are looped in full
+            auto func = PRAGMA_THREADS_FOR_3D {
+                for (uint b = start_x; b < stop_x; b += inc_x) {
+                    for (uint c = start_y; c < stop_y; c += inc_y) {
+                        for (uint d = start_z; d < stop_z; d += inc_z) {
+                            for (uint h = 0; h < iH; ++h) {
+                                for (uint w = 0; w < iW; ++w) {
+                                    // accumulate in a local and store once, instead of read-modify-writing z for every addend
+                                    T sum = static_cast<T>(0);
+
+                                    for (uint xd = d * factorD; xd < d * factorD + factorD; ++xd)
+                                        for (uint xh = h * factorH; xh < h * factorH + factorH; ++xh)
+                                            for (uint xw = w * factorW; xw < w * factorW + factorW; ++xw)
+                                                sum += x[b * xStride0 + c * xStride1 + xd * xStride2 + xh * xStride3 + xw * xStride4];
+
+                                    z[b * zStride0 + c * zStride1 + d * zStride2 + h * zStride3 + w * zStride4] = sum;
+                                }
+                            }
+                        }
+                    }
+                }
+            };
+
+            samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, iD, 1);
+        }
+
+        
+        void ConvolutionUtils::upsampling3dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) {
+            BUILD_SINGLE_SELECTOR(gradO.dataType(), upsampling3dBP_, (gradO, gradI, isNCHW), FLOAT_TYPES);   // `isNCHW` is forwarded as upsampling3dBP_'s `isNCDHW` flag; presumably runs upsampling3dBP_<T> for the T matching gradO.dataType() among FLOAT_TYPES (confirm in the macro definition); `block` is not used in this definition
+        }
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions_vol2col.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_vol2col.cpp
new file mode 100644
index 000000000..552dceb6a
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions_vol2col.cpp
@@ -0,0 +1,147 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 18.09.2018
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <execution/Threads.h>
+
+namespace sd {
+    namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+// [bS, iC, iD, iH, iW] is convoluted to [bS, iC, kD, kH, kW, oD, oH, oW]
+template <typename T>
+static void vol2col_(const NDArray& volume, NDArray& columns, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+            // sD/sH/sW, pD/pH/pW and dD/dH/dW scale the output position, offset the origin and scale the kernel index respectively (see volDep/volRow/volCol below)
+            const int bS = volume.sizeAt(0);
+            const int iC = volume.sizeAt(1);
+            const int iD = volume.sizeAt(2);
+            const int iH = volume.sizeAt(3);
+            const int iW = volume.sizeAt(4);
+            const int kD = columns.sizeAt(2);
+            const int kH = columns.sizeAt(3);
+            const int kW = columns.sizeAt(4);
+            const int oD = columns.sizeAt(5);
+            const int oH = columns.sizeAt(6);
+            const int oW = columns.sizeAt(7);
+            const Nd4jLong colStride0 = columns.stridesOf()[0];
+            const Nd4jLong colStride1 = columns.stridesOf()[1];
+            const Nd4jLong colStride2 = columns.stridesOf()[2];
+            const Nd4jLong colStride3 = columns.stridesOf()[3];
+            const Nd4jLong colStride4 = columns.stridesOf()[4];
+            const Nd4jLong colStride5 = columns.stridesOf()[5];
+            const Nd4jLong colStride6 = columns.stridesOf()[6];
+            const Nd4jLong colStride7 = columns.stridesOf()[7];
+            const Nd4jLong volStride0 = volume.stridesOf()[0];
+            const Nd4jLong volStride1 = volume.stridesOf()[1];
+            const Nd4jLong volStride2 = volume.stridesOf()[2];
+            const Nd4jLong volStride3 = volume.stridesOf()[3];
+            const Nd4jLong volStride4 = volume.stridesOf()[4];
+
+            T* colBuff = columns.bufferAsT<T>();
+            const T* volBuff = volume.bufferAsT<T>();      // volume is only read, so no const_cast is needed
+            // both branches write the same values; they differ only in loop nesting order and in which dimensions are passed to parallel_for
+
+            if (volume.ordering() == 'c' &&  columns.ordering() == 'c' && shape::strideDescendingCAscendingF(volume.getShapeInfo()) && shape::strideDescendingCAscendingF(columns.getShapeInfo())) {
+                // loops are nested in the dimension order of columns (colW innermost); parallel_for gets the ranges of b, c and kDep
+                auto func = PRAGMA_THREADS_FOR_3D {
+                    T* col; const T* vol;
+                    int volDep, volRow, volCol;
+
+                    for (int b = start_x; b < stop_x; b += inc_x) {
+                        for (int c = start_y; c < stop_y; c += inc_y) {
+                            for (int kDep = start_z; kDep < stop_z; kDep += inc_z) {
+                                for (int kRow = 0; kRow < kH; ++kRow) {
+                                    for (int kCol = 0; kCol < kW; ++kCol) {
+                                        for (int colD = 0; colD < oD; ++colD) {
+                                            for (int colH = 0; colH < oH; ++colH) {
+                                                for (int colW = 0; colW < oW; ++colW) {
+                                                    // volume coordinates addressed by kernel index (kDep, kRow, kCol) at output position (colD, colH, colW)
+                                                    volDep = (-pD + kDep * dD) + colD * sD;
+                                                    volRow = (-pH + kRow * dH) + colH * sH;
+                                                    volCol = (-pW + kCol * dW) + colW * sW;
+
+                                                    col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7;
+                                                    // the unsigned casts make negative coordinates wrap to large values, so one comparison rejects both < 0 and >= size
+                                                    if (static_cast<unsigned>(volDep) >= static_cast<unsigned>(iD) || static_cast<unsigned>(volRow) >= static_cast<unsigned>(iH) || static_cast<unsigned>(volCol) >= static_cast<unsigned>(iW))
+                                                        *col = static_cast<T>(0.);
+                                                    else {
+                                                        vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4;
+                                                        *col = *vol;
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, iC, 1, 0, kD, 1);
+
+            } else {
+                // output positions are the outer loops here (kCol innermost); parallel_for gets the ranges of b and colD
+                auto func = PRAGMA_THREADS_FOR_2D {
+                    T* col; const T* vol;
+                    int volDep, volRow, volCol;
+                    for (int b = start_x; b < stop_x; b++) {
+                        for (int colD = start_y; colD < stop_y; colD++) {
+                            for (int colH = 0; colH < oH; ++colH) {
+                                for (int colW = 0; colW < oW; ++colW) {
+                                    for (int c = 0; c < iC; ++c) {
+                                        for (int kDep = 0; kDep < kD; ++kDep) {
+                                            for (int kRow = 0; kRow < kH; ++kRow) {
+                                                for (int kCol = 0; kCol < kW; ++kCol) {
+                                                    // same coordinate mapping and bounds test as in the branch above
+                                                    volDep = (-pD + kDep * dD) + colD * sD;
+                                                    volRow = (-pH + kRow * dH) + colH * sH;
+                                                    volCol = (-pW + kCol * dW) + colW * sW;
+
+                                                    col = colBuff + b * colStride0 + c * colStride1 + kDep * colStride2 + kRow * colStride3 + kCol * colStride4 + colD * colStride5 + colH * colStride6 + colW * colStride7;
+
+                                                    if (static_cast<unsigned>(volDep) >= static_cast<unsigned>(iD) || static_cast<unsigned>(volRow) >= static_cast<unsigned>(iH) || static_cast<unsigned>(volCol) >= static_cast<unsigned>(iW))
+                                                        *col = static_cast<T>(0.f);
+                                                    else {
+                                                        vol = volBuff + b * volStride0 + c * volStride1 + volDep * volStride2 + volRow * volStride3 + volCol * volStride4;
+                                                        *col = *vol;
+                                                    }
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                };
+
+                samediff::Threads::parallel_for(func, 0, bS, 1, 0, oD, 1);
+                //func(0, 0, bS, 1, 0, oD, 1);
+            }
+        }
+
+void ConvolutionUtils::vol2col(sd::graph::Context& block, const NDArray& volume, NDArray& columns, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+     BUILD_SINGLE_SELECTOR(volume.dataType(), vol2col_, (volume, columns, sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);   // presumably runs vol2col_<T> for the T matching volume.dataType() among FLOAT_TYPES (confirm in the macro definition); `block` is not used in this definition
+}
+
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
deleted file mode 100644
index 47da861ed..000000000
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ /dev/null
@@ -1,1670 +0,0 @@
-/*******************************************************************************
- * Copyright (c) 2015-2018 Skymind, Inc.
- * Copyright (c) 2019 Konduit K.K.
- *
- * This program and the accompanying materials are made available under the
- * terms of the Apache License, Version 2.0 which is available at
- * https://www.apache.org/licenses/LICENSE-2.0.
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations
- * under the License.
- *
- * SPDX-License-Identifier: Apache-2.0
- ******************************************************************************/
-
-//
-// @author Yurii Shyrma (iuriish@yahoo.com)
-//
-
-#include <ops/declarable/helpers/convolutions.h>
-#include <ops/declarable/helpers/im2col.h>
-#include <ops/declarable/helpers/col2im.h>
-#include<ops/declarable/helpers/addBias.h>
-#include <exceptions/cuda_exception.h>
-#include <array/NDArrayFactory.h>
-#include <helpers/MmulHelper.h>
-#include <helpers/PointersManager.h>
-#include <math/templatemath.h>
-
-namespace sd {
-namespace ops  {
-
-//////////////////////////////////////////////////////////////////////////
-// vol [bS, iC, iD, iH, iW] is convoluted to col [bS, iC, kD, kH, kW, oD, oH, oW]
-template <typename T>
-static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeInfo, void* columns, const Nd4jLong* colShapeInfo,  const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-
-    const T* vol = reinterpret_cast<const T*>(volume);
-          T* col = reinterpret_cast<T*>(columns);
-
-    __shared__ int colRank, volRank;
-    __shared__ Nd4jLong colLen, iD, iH, iW, *sharedMem;
-
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        volRank = 5;
-        colRank = 8;
-
-        colLen = shape::length(colShapeInfo);
-
-        iD = volShapeInfo[3];
-        iH = volShapeInfo[4];
-        iW = volShapeInfo[5];
-    }
-    __syncthreads();
-
-    const auto colInd = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if(colInd >= colLen)
-        return;
-
-    auto coords = sharedMem + threadIdx.x * colRank;
-
-    shape::index2coords(colInd, colShapeInfo, coords);
-
-    // const auto colW = coords[7];
-    // const auto colH = coords[6];
-    // const auto colD = coords[5];
-    // const auto kCol = coords[4];
-    // const auto kRow = coords[3];
-    // const auto kDep = coords[2];
-    // const auto c    = coords[1];
-    // const auto b    = coords[0];
-
-    const auto colOffset = shape::getOffset(colShapeInfo, coords);
-
-    coords[2] = -pD + coords[2] * dD + coords[5] * sD;     // const auto volDep = (-pD + kDep * dD) + colD * sD;
-    coords[3] = -pH + coords[3] * dH + coords[6] * sH;     // const auto volRow = (-pH + kRow * dH) + colH * sH;
-    coords[4] = -pW + coords[4] * dW + coords[7] * sW;     // const auto volCol = (-pW + kCol * dW) + colW * sW;
-
-    if (static_cast<unsigned>(coords[2]) >= static_cast<unsigned>(iD) || static_cast<unsigned>(coords[3]) >= static_cast<unsigned>(iH) || static_cast<unsigned>(coords[4]) >= static_cast<unsigned>(iW))
-        col[colOffset] = static_cast<T>(0.);
-    else
-        col[colOffset] = vol[shape::getOffset(volShapeInfo, coords)];
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-static void vol2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                const void* volume, const Nd4jLong* volShapeInfo,
-                                      void* columns, const Nd4jLong* colShapeInfo,
-                                const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-
-    vol2colCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(volume, volShapeInfo, columns, colShapeInfo,  sD, sH, sW, pD, pH, pW, dD, dH, dW);
-}
-
-//////////////////////////////////////////////////////////////////////////
-void ConvolutionUtils::vol2col(sd::graph::Context& block, const NDArray& vol, NDArray& col, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-
-    PointersManager manager(block.launchContext(), "vol2col");
-
-    const int threadsPerBlock = MAX_NUM_THREADS / 4;
-    const int blocksPerGrid = (col.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = col.rankOf() * sizeof(Nd4jLong) * threadsPerBlock  + 128;
-
-    NDArray::prepareSpecialUse({&col}, {&vol});
-    BUILD_SINGLE_SELECTOR(vol.dataType(), vol2colCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, block.launchContext()->getCudaStream(), vol.getSpecialBuffer(), vol.getSpecialShapeInfo(), col.specialBuffer(), col.specialShapeInfo(), sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);
-    NDArray::registerSpecialUse({&col}, {&vol});
-
-    manager.synchronize();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// columns [bS, iC, kD, kH, kW, oD, oH, oW] to be de-convoluted to volume [bS, iC, iD, iH, iW]
-template <typename T>
-static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShapeInfo, void* volume, const Nd4jLong* volShapeInfo,  const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-
-    const T* col = reinterpret_cast<const T*>(columns);
-          T* vol = reinterpret_cast<T*>(volume);
-
-    __shared__ uint kD, kH, kW, oD, oH, oW, *sharedMem;
-    __shared__ Nd4jLong volLen;
-
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<uint*>(shmem);
-
-        oD = colShapeInfo[6];
-        oH = colShapeInfo[7];
-        oW = colShapeInfo[8];
-
-        kD = dD * (colShapeInfo[3] - 1) + 1;
-        kH = dH * (colShapeInfo[4] - 1) + 1;
-        kW = dW * (colShapeInfo[5] - 1) + 1;
-
-        volLen  = shape::length(volShapeInfo);
-    }
-    __syncthreads();
-
-    auto coords = sharedMem + threadIdx.x * 8;
-
-    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    for (Nd4jLong i = tid; i < volLen; i += gridDim.x * blockDim.x) {
-
-        shape::index2coords(i, volShapeInfo, coords);
-
-        const auto volOffset = shape::getOffset(volShapeInfo, coords);
-
-        const auto bSiCoffset = coords[0] * colShapeInfo[9] + coords[1] * colShapeInfo[10];
-
-        const uint imD = coords[2] + pD;
-        const uint imH = coords[3] + pH;
-        const uint imW = coords[4] + pW;
-
-        const uint colDstart = (imD < kD) ? 0 : (imD - kD) / sD + 1;
-        const uint colHstart = (imH < kH) ? 0 : (imH - kH) / sH + 1;
-        const uint colWstart = (imW < kW) ? 0 : (imW - kW) / sW + 1;
-
-        const uint colDend = sd::math::nd4j_min<uint>(imD / sD + 1, oD);
-        const uint colHend = sd::math::nd4j_min<uint>(imH / sH + 1, oH);
-        const uint colWend = sd::math::nd4j_min<uint>(imW / sW + 1, oW);
-
-        T val = 0;
-
-        for(uint colD = colDstart; colD < colDend; ++colD) {
-            coords[2] = imD - colD * sD;
-            if(coords[2] % dD != 0) continue;
-
-            for(uint colH = colHstart; colH < colHend; ++colH) {
-                coords[3] = imH - colH * sH;
-                if(coords[3] % dH != 0) continue;
-
-                for(uint colW = colWstart; colW < colWend; ++colW) {
-                    coords[4] = imW - colW * sW;
-                    if(coords[4] % dW != 0) continue;
-
-                    val += col[bSiCoffset + (coords[2]/dD)*colShapeInfo[11] + (coords[3]/dH)*colShapeInfo[12] + (coords[4]/dW)*colShapeInfo[13] + colD*colShapeInfo[14] + colH*colShapeInfo[15] + colW*colShapeInfo[16]];
-
-                }
-            }
-        }
-
-        vol[volOffset] = val;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-static void col2volCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                const void* columns, const Nd4jLong* colShapeInfo,
-                                      void* volume, const Nd4jLong* volShapeInfo,
-                                const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-
-    col2volCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(columns, colShapeInfo, volume, volShapeInfo, sD, sH, sW, pD, pH, pW, dD, dH, dW);
-}
-
-//////////////////////////////////////////////////////////////////////////
-void ConvolutionUtils::col2vol(sd::graph::Context& block, const NDArray& col, NDArray& vol, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
-
-    PointersManager manager(block.launchContext(), "col2vol");
-
-    const int threadsPerBlock = MAX_NUM_THREADS / 4;
-    const int blocksPerGrid = (vol.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = col.rankOf() * sizeof(uint) * threadsPerBlock  + 256;
-
-    NDArray::prepareSpecialUse({&vol}, {&col});
-    BUILD_SINGLE_SELECTOR(vol.dataType(), col2volCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, block.launchContext()->getCudaStream(), col.getSpecialBuffer(), col.getSpecialShapeInfo(), vol.specialBuffer(), vol.specialShapeInfo(), sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);
-    NDArray::registerSpecialUse({&vol}, {&col});
-
-    manager.synchronize();
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Y>
-static void conv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-
-    // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    // weights [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
-    // bias    [oC]
-    // output  [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
-
-    // kH  filter(kernel) height
-    // kW  filter(kernel) width
-    // sH  strides height
-    // sW  strides width
-    // pH  paddings height
-    // pW  paddings width
-    // dH  dilations height
-    // dW  dilations width
-    // paddingMode 0-VALID, 1-SAME
-    // isNCHW     1-NCHW,  0-NHWC
-
-    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
-    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
-
-    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
-
-    std::vector<int> permutForOutput;
-
-    if(isNCHW)
-        permutForOutput = {0, 3, 1, 2};                                             // [bS, oH, oW, oC] -> [bS, oC, oH, oW]
-    else
-        input = new NDArray(input->permute({0, 3, 1, 2}));                         // [bS, iH, iW, iC] -> [bS, iC, iH, iW] if NHWC
-
-    std::vector<int> wAxes;
-    if(0 == wFormat)
-        wAxes = {0, 1, 2};
-    else if(1 == wFormat)
-        wAxes = {2, 3, 1};
-    else
-        wAxes = {1, 2, 3};
-
-    NDArray col('c', {bS, oH, oW, kH, kW, iC}, input->dataType(), input->getContext());
-    NDArray colP = col.permute({0, 5, 3, 4, 1, 2});            // {bS, iC, kH, kW, oH, oW}
-    NDArray mmulResult('f', {bS*oH*oW, oC}, output->dataType(), output->getContext());
-
-    //----- calculation of output -----//
-    auto ctx = block.launchContext();
-    helpers::im2col(*ctx, *input, colP, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
-    MmulHelper::tensorDot(&col, weights, &mmulResult, {3,4,5}, wAxes, {}); // [bS, oH, oW, kH, kW, iC] x [kH, kW, iC, oC] = [bS, oH, oW, oC]
-
-    //----- assign outTemp to output  -----//
-    if(isNCHW) {
-        mmulResult.reshapei({bS, oH, oW, oC});
-        mmulResult.permutei(permutForOutput);
-    }
-    output->assign(mmulResult);
-
-    //----- add biases if required -----//
-    if(bias)
-        // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
-        helpers::addBias(block, *output, *bias, *output, isNCHW);
-
-    if(!isNCHW)
-        delete input;
-
-}
-
-//////////////////////////////////////////////////////////////////////////
-void ConvolutionUtils::conv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Y>
-static void depthwiseConv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-    // Depthwise 2D convolution: im2col on the input, then a tensorDot of the columns with the weights written into a reshaped view of output, then an optional bias add.
-    // input     [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    // weights   [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
-    // bias      [oC] = iC*mC
-    // output    [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)
-
-    // kH           filter(kernel) height
-    // kW           filter(kernel) width
-    // sH           strides height
-    // sW           strides width
-    // pH           paddings height
-    // pW           paddings width
-    // dH           dilations height
-    // dW           dilations width
-    // paddingMode   0-VALID, 1-SAME
-    // isNCHW       1-NCHW,  0-NHWC
-
-    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
-    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
-    mC = weights->sizeAt(indWmC);                           // channels multiplier
-
-    std::vector<std::vector<Nd4jLong>> modifColumns = {{1,0,4,5,2,3}, {iC,bS*oH*oW,kH*kW}};  // [bS,iC,kH,kW,oH,oW] -> [iC,bS,oH,oW,kH,kW] -> [iC,bS*oH*oW,kH*kW]
-    std::vector<std::vector<Nd4jLong>> modifOutput, modifWeights;
-    std::vector<Nd4jLong> outReShape;
-
-    if(!isNCHW) {
-        outReShape = {bS, oH, oW, iC, mC};                                              // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
-        modifOutput = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
-        input = new NDArray(input->permute({0, 3, 1, 2}));                              // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
-    }
-    else {
-        outReShape = {bS, iC, mC, oH, oW};                                              // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
-        modifOutput = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
-    }
-    // weights are permuted to [iC,kH,kW,mC] and reshaped to [iC,kH*kW,mC]; the permutation depends on wFormat (0: [kH,kW,iC,mC], 1: [mC,iC,kH,kW], otherwise: [mC,kH,kW,iC])
-    if(0 == wFormat)
-        modifWeights = {{2,0,1,3},{iC,kH*kW,mC}};
-    else if(1 == wFormat)
-        modifWeights = {{1,2,3,0},{iC,kH*kW,mC}};
-    else
-        modifWeights = {{3,1,2,0},{iC,kH*kW,mC}};
-
-    if(paddingMode == 1)                       // SAME
-        ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);
-
-    NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
-    NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false);
-
-    helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
-    MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, modifWeights, modifOutput);              // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC]
-
-    if(bias)
-        // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
-        helpers::addBias(block, *output, *bias, *output, isNCHW);
-    // in the NHWC case `input` points to the permuted copy allocated with new above, so it is released here
-    if(!isNCHW)
-        delete input;
-}
-
-//////////////////////////////////////////////////////////////////////////
-void ConvolutionUtils::depthwiseConv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // forwards every argument unchanged to the templated depthwiseConv2d_; NOTE(review): macro is defined elsewhere - presumably it selects the instantiation matching input->dataType() among FLOAT_TYPES
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Y>
-static void sconv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-
-    // Separable 2D convolution: a depthwise convolution, optionally followed by a 1x1 (pointwise) convolution.
-    //
-    // input         [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
-    // weightsDepth  [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
-    // weightsPoint  [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]; may be nullptr
-    // bias          [oC], oC = iC*mC if weightsPoint=nullptr; may be nullptr
-    // output is     [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW]  (NCHW)
-
-    //  kH         filter(kernel) height
-    //  kW         filter(kernel) width
-    //  sH         strides height
-    //  sW         strides width
-    //  pH         paddings height
-    //  pW         paddings width
-    //  dH         dilations height
-    //  dW         dilations width
-    //  paddingMode 0-VALID, 1-SAME
-    //  isNCHW     1-NCHW,  0-NHWC
-
-    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier, output channels, output height/width
-    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
-    mC = weightsDepth->sizeAt(indWmC);                      // channels multiplier
-
-    // ----- depthwise only: the result goes straight into output and the bias (if any) is applied here (oC = iC*mC) ----- //
-    if (!weightsPoint) {
-        ConvolutionUtils::depthwiseConv2d(block, input, weightsDepth, bias, output, kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, isNCHW, wFormat);
-        return;
-    }
-
-    // ----- depthwise + pointwise ----- //
-    // The intermediate depthwise result lives on the stack, so it is released on every exit path,
-    // including when one of the convolutions below throws.
-    NDArray outputDepth(output->ordering(), !isNCHW ? std::vector<Nd4jLong>({bS, oH, oW, iC*mC}) : std::vector<Nd4jLong>({bS, iC*mC, oH, oW}), input->dataType(), input->getContext());
-
-    // depthwise step without bias: the bias belongs to the final (pointwise) output
-    ConvolutionUtils::depthwiseConv2d(block, input, weightsDepth, nullptr, &outputDepth, kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, isNCHW, wFormat);
-
-    // pointwise step: 1x1 kernel, unit strides/dilations, zero padding, so oH=iH and oW=iW
-    ConvolutionUtils::conv2d(block, &outputDepth, weightsPoint, bias, output, 1,1, 1,1, 0,0, 1,1, paddingMode, isNCHW, wFormat);
-}
-
-//////////////////////////////////////////////////////////////////////////
-void ConvolutionUtils::sconv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), sconv2d_, (block, input, weightsDepth, weightsPoint, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // forwards every argument unchanged to the templated sconv2d_; NOTE(review): macro is defined elsewhere - presumably it selects the instantiation matching input->dataType() among FLOAT_TYPES
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Z>
-static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
-
-    // Average pooling over dilated [kH, kW] windows; each loop iteration produces one output element.
-    // input is  [bS, iC, iH, iW]
-    // output is [bS, iC, oH, oW]
-    // extraParam0 == 1: divide by kH*kW (padding included); otherwise divide by the number of in-bounds window samples
-    // Launch layout: 1D grid of 1D blocks; the loop strides by the total number of launched threads.
-    // Only statically sized shared memory is used.
-
-    const auto x = reinterpret_cast<const X*>(vx);
-          auto z = reinterpret_cast<Z*>(vz);
-
-    __shared__ int iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
-
-    // thread 0 caches sizes/strides for the whole block
-    if (threadIdx.x == 0) {
-        iC = shape::sizeAt(xShapeInfo, 1);
-        oH = shape::sizeAt(zShapeInfo, 2);
-        oW = shape::sizeAt(zShapeInfo, 3);
-        iH = shape::sizeAt(xShapeInfo, 2);
-        iW = shape::sizeAt(xShapeInfo, 3);
-
-        strideB = shape::stride(xShapeInfo)[0];
-        strideC = shape::stride(xShapeInfo)[1];
-        strideY = shape::stride(xShapeInfo)[2];
-        strideX = shape::stride(xShapeInfo)[3];
-
-        strideOB = shape::stride(zShapeInfo)[0];
-        strideOC = shape::stride(zShapeInfo)[1];
-        strideOY = shape::stride(zShapeInfo)[2];
-        strideOX = shape::stride(zShapeInfo)[3];
-
-        length = shape::length(zShapeInfo);
-
-        // effective kernel H/W accounting for dilation
-        kHEff = kH + (kH-1)*(dH-1);
-        kWEff = kW + (kW-1)*(dW-1);
-    }
-    __syncthreads();
-
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
-
-        // decompose the linear output index into (n, c, ph, pw)
-        const int pw = index % oW;
-        const int ph = (index / oW) % oH;
-        const int c = (index / oW / oH) % iC;
-        const int n = index / oW / oH / iC;
-
-        // window bounds in input coordinates (may reach into the padding)
-        int hstart = sH * ph - pH;
-        int wstart = sW * pw - pW;
-        int hend = hstart + kHEff;
-        int wend = wstart + kWEff;
-
-        // Clamp the window to the input in whole dilation steps.
-        // Integer ceil-division is exact (the numerator is strictly positive inside each branch);
-        // rounding through Z would be inexact when Z is a half-precision type.
-        if (hstart < 0)
-            hstart += dH * ((-hstart + dH - 1) / dH);
-        if (wstart < 0)
-            wstart += dW * ((-wstart + dW - 1) / dW);
-        if (hend > iH)
-            hend -= dH * ((hend - iH + dH - 1) / dH);
-        if (wend > iW)
-            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-        Z sum = 0.0f;
-
-        const X *inSlice = x + (n * strideB + c * strideC);
-
-        for (int h = hstart; h < hend; h += dH)
-            for (int w = wstart; w < wend; w += dW)
-                sum += static_cast<Z>(inSlice[h * strideY + w * strideX]);
-
-        int divide_factor;
-        if (extraParam0 == 1)     //Case 1: include padding
-            divide_factor = kH * kW;
-        else                      //Case 0: exclude padding - number of in-bounds samples, accounts for dilation
-            divide_factor = sd::math::nd4j_ceil<double,int>((double) (hend-hstart) / (double) dH) * sd::math::nd4j_ceil<double,int>((double) (wend-wstart) / (double) dW);
-
-        z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = sum / static_cast<Z>(divide_factor);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Z>
-static void avgPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
-    // fixed launch configuration: 512 blocks of 512 threads, 4192 bytes of dynamic shared memory, on the context's stream
-    const int numBlocks   = 512;
-    const int numThreads  = 512;
-    const int sharedBytes = 4192;
-    auto stream = block.getCudaStream();
-
-    avgPooling2dCuda<X, Z><<<numBlocks, numThreads, sharedBytes, *stream>>>(vx, vxShapeInfo, vz, vzShapeInfo, kH, kW, sH, sW, pH, pW, dH, dW, extraParam0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Z>
-static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
-
-    // P-norm pooling over dilated [kH, kW] windows: z = (sum |x|^p)^(1/p) with p = extraParam0.
-    // Each loop iteration produces one output element.
-    // input is  [bS, iC, iH, iW]
-    // output is [bS, iC, oH, oW]
-    // Launch layout: 1D grid of 1D blocks; the loop strides by the total number of launched threads.
-    // Only statically sized shared memory is used.
-
-    const auto x = reinterpret_cast<const X*>(vx);
-          auto z = reinterpret_cast<Z*>(vz);
-
-    __shared__ int iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
-
-    // thread 0 caches sizes/strides for the whole block
-    if (threadIdx.x == 0) {
-        iC = shape::sizeAt(xShapeInfo, 1);
-        oH = shape::sizeAt(zShapeInfo, 2);
-        oW = shape::sizeAt(zShapeInfo, 3);
-        iH = shape::sizeAt(xShapeInfo, 2);
-        iW = shape::sizeAt(xShapeInfo, 3);
-
-        strideB = shape::stride(xShapeInfo)[0];
-        strideC = shape::stride(xShapeInfo)[1];
-        strideY = shape::stride(xShapeInfo)[2];
-        strideX = shape::stride(xShapeInfo)[3];
-
-        strideOB = shape::stride(zShapeInfo)[0];
-        strideOC = shape::stride(zShapeInfo)[1];
-        strideOY = shape::stride(zShapeInfo)[2];
-        strideOX = shape::stride(zShapeInfo)[3];
-
-        length = shape::length(zShapeInfo);
-
-        // effective kernel H/W accounting for dilation
-        kHEff = kH + (kH-1)*(dH-1);
-        kWEff = kW + (kW-1)*(dW-1);
-    }
-    __syncthreads();
-
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
-
-        // decompose the linear output index into (n, c, ph, pw)
-        const int pw = index % oW;
-        const int ph = (index / oW) % oH;
-        const int c = (index / oW / oH) % iC;
-        const int n = index / oW / oH / iC;
-
-        // window bounds in input coordinates (may reach into the padding)
-        int hstart = sH * ph - pH;
-        int wstart = sW * pw - pW;
-        int hend = hstart + kHEff;
-        int wend = wstart + kWEff;
-
-        // Clamp the window to the input in whole dilation steps.
-        // Integer ceil-division is exact (the numerator is strictly positive inside each branch);
-        // rounding through Z would be inexact when Z is a half-precision type.
-        if (hstart < 0)
-            hstart += dH * ((-hstart + dH - 1) / dH);
-        if (wstart < 0)
-            wstart += dW * ((-wstart + dW - 1) / dW);
-        if (hend > iH)
-            hend -= dH * ((hend - iH + dH - 1) / dH);
-        if (wend > iW)
-            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-        Z sum = 0.f;
-
-        const X *inSlice = x + (n * strideB + c * strideC);
-
-        // accumulate |x|^p over the in-bounds window samples
-        for (int h = hstart; h < hend; h += dH)
-            for (int w = wstart; w < wend; w += dW)
-                sum += sd::math::nd4j_pow<Z, Z, Z>(static_cast<Z>(sd::math::nd4j_abs<X>(inSlice[h * strideY + w * strideX])), extraParam0);
-
-        // p-th root of the accumulated sum
-        z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = sd::math::nd4j_pow<Z, Z, Z>(sum, (Z) 1.0f / extraParam0);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Z>
-static void pnormPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
-    // fixed launch configuration: 512 blocks of 512 threads, 4192 bytes of dynamic shared memory, on the context's stream
-    const int numBlocks   = 512;
-    const int numThreads  = 512;
-    const int sharedBytes = 4192;
-    auto stream = block.getCudaStream();
-
-    pnormPooling2dCuda<X, Z><<<numBlocks, numThreads, sharedBytes, *stream>>>(vx, vxShapeInfo, vz, vzShapeInfo, kH, kW, sH, sW, pH, pW, dH, dW, extraParam0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Z>
-static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
-
-    // Max pooling over dilated [kH, kW] windows; each loop iteration produces one output element.
-    // input is  [bS, iC, iH, iW]
-    // output is [bS, iC, oH, oW]
-    // extraParam0 is not used by this kernel; it is kept so the three 2D pooling kernels share one signature.
-    // Launch layout: 1D grid of 1D blocks; the loop strides by the total number of launched threads.
-    // Only statically sized shared memory is used.
-
-    const auto x = reinterpret_cast<const X*>(vx);
-          auto z = reinterpret_cast<Z*>(vz);
-
-    __shared__ int iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
-
-    // thread 0 caches sizes/strides for the whole block
-    if (threadIdx.x == 0) {
-        iC = shape::sizeAt(xShapeInfo, 1);
-        oH = shape::sizeAt(zShapeInfo, 2);
-        oW = shape::sizeAt(zShapeInfo, 3);
-        iH = shape::sizeAt(xShapeInfo, 2);
-        iW = shape::sizeAt(xShapeInfo, 3);
-
-        strideB = shape::stride(xShapeInfo)[0];
-        strideC = shape::stride(xShapeInfo)[1];
-        strideY = shape::stride(xShapeInfo)[2];
-        strideX = shape::stride(xShapeInfo)[3];
-
-        strideOB = shape::stride(zShapeInfo)[0];
-        strideOC = shape::stride(zShapeInfo)[1];
-        strideOY = shape::stride(zShapeInfo)[2];
-        strideOX = shape::stride(zShapeInfo)[3];
-
-        length = shape::length(zShapeInfo);
-
-        // effective kernel H/W accounting for dilation
-        kHEff = kH + (kH-1)*(dH-1);
-        kWEff = kW + (kW-1)*(dW-1);
-    }
-    __syncthreads();
-
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
-
-        // decompose the linear output index into (n, c, ph, pw)
-        const int pw = index % oW;
-        const int ph = (index / oW) % oH;
-        const int c = (index / oW / oH) % iC;
-        const int n = index / oW / oH / iC;
-
-        // window bounds in input coordinates (may reach into the padding)
-        int hstart = sH * ph - pH;
-        int wstart = sW * pw - pW;
-        int hend = hstart + kHEff;
-        int wend = wstart + kWEff;
-
-        // Clamp the window to the input in whole dilation steps.
-        // Integer ceil-division is exact (the numerator is strictly positive inside each branch);
-        // rounding through Z would be inexact when Z is a half-precision type.
-        if (hstart < 0)
-            hstart += dH * ((-hstart + dH - 1) / dH);
-        if (wstart < 0)
-            wstart += dW * ((-wstart + dW - 1) / dW);
-        if (hend > iH)
-            hend -= dH * ((hend - iH + dH - 1) / dH);
-        if (wend > iW)
-            wend -= dW * ((wend - iW + dW - 1) / dW);
-
-        // start from the lowest value so any in-bounds sample replaces it
-        Z max = -sd::DataTypeUtils::max<Z>();
-
-        const X *inSlice = x + (n * strideB + c * strideC);
-
-        for (int h = hstart; h < hend; h += dH) {
-            for (int w = wstart; w < wend; w += dW) {
-                Z v = static_cast<Z>(inSlice[h * strideY + w * strideX]);
-                if (v > max)
-                    max = v;
-            }
-        }
-
-        z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = max;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename X, typename Z>
-static void maxPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
-    // fixed launch configuration: 512 blocks of 512 threads, 4192 bytes of dynamic shared memory, on the context's stream
-    const int numBlocks   = 512;
-    const int numThreads  = 512;
-    const int sharedBytes = 4192;
-    auto stream = block.getCudaStream();
-
-    maxPooling2dCuda<X,Z><<<numBlocks, numThreads, sharedBytes, *stream>>>(vx, vxShapeInfo, vz, vzShapeInfo, kH, kW, sH, sW, pH, pW, dH, dW, extraParam0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-void ConvolutionUtils::pooling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) {
-
-    // 2D pooling forward pass on the device.
-    // input  [bS, iC, iH, iW], output [bS, iC, oH, oW] (layout expected by the kernels)
-    // poolingMode selects the kernel: MAX_POOL, AVG_POOL or PNORM_POOL; any other value throws std::runtime_error
-    // extraParam0 is passed through to the selected kernel launcher unchanged
-    // Blocks until the stream has finished; a non-zero stream status is rethrown as a cuda_exception.
-
-    // same device-side bookkeeping calls as the pooling3d / pooling2dBP helpers: output is the written array, input the read one
-    NDArray::prepareSpecialUse({&output}, {&input});
-
-    switch (poolingMode) {
-
-        case MAX_POOL: {
-                BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), maxPooling2dCudaLauncher, (*block.launchContext(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
-            }
-            break;
-        case AVG_POOL: {
-                BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), avgPooling2dCudaLauncher, (*block.launchContext(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
-            }
-            break;
-        case PNORM_POOL: {
-                BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), pnormPooling2dCudaLauncher, (*block.launchContext(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
-            }
-            break;
-        default:
-            throw std::runtime_error("Pooling2D: Unknown PoolingType used");
-    }
-
-    NDArray::registerSpecialUse({&output}, {&input});
-
-    // wait for the kernel and surface any execution error
-    auto result = cudaStreamSynchronize(*block.launchContext()->getCudaStream());
-    if (result != 0)
-        throw cuda_exception::build("Pooling2D failed", result);
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-__global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-    // 3D pooling forward pass; poolingMode: 0 - max, 1 - avg, 2 - pnorm. Every thread computes at most one element of z.
-    // x input  is [bS, iC, iD, iH, iW]
-    // z output is [bS, iC, oD, oH, oW]
-    // dynamic shared memory (3rd launch argument) must hold `rank` (= 5) Nd4jLong coordinates for every thread of the block
-    const T* x = reinterpret_cast<const T*>(vx);
-          T* z = reinterpret_cast<T*>(vz);
-
-    __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
-    __shared__ Nd4jLong zLen, *sharedMem;
-
-    if (threadIdx.x == 0) {                      // thread 0 initialises the block-wide values
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        zLen = shape::length(zShapeInfo);
-        rank = 5;
-
-        kDeff = kD + (kD - 1) * (dD - 1);        // effective kernel sizes accounting for dilation
-        kHeff = kH + (kH - 1) * (dH - 1);
-        kWeff = kW + (kW - 1) * (dW - 1);
-
-        iD = xShapeInfo[3];                      // NOTE(review): assumes shapeInfo is laid out as [rank, dim0, dim1, ...] - confirm
-        iH = xShapeInfo[4];
-        iW = xShapeInfo[5];
-
-        kProd = kD * kH * kW;
-    }
-    __syncthreads();
-
-    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if(zInd >= zLen)                             // surplus threads of the last block exit (after the barrier above)
-        return;
-
-    auto coords = sharedMem + threadIdx.x * rank;   // this thread's private slice of the shared coordinate buffer
-
-    shape::index2coords(zInd, zShapeInfo, coords);
-
-    const auto zOffset = shape::getOffset(zShapeInfo, coords);
-
-    int dstart = coords[2] * sD - pD;
-    int hstart = coords[3] * sH - pH;
-    int wstart = coords[4] * sW - pW;
-    int dend = dstart + kDeff;
-    int hend = hstart + kHeff;
-    int wend = wstart + kWeff;
-    // clamp the window to the input in whole dilation steps (integer ceil-division)
-    if(dstart < 0)
-        dstart += dD * ((-dstart + dD - 1) / dD);
-    if(hstart < 0)
-        hstart += dH * ((-hstart + dH - 1) / dH);
-    if(wstart < 0)
-        wstart += dW * ((-wstart + dW - 1) / dW);
-    if(dend > iD)
-        dend -= dD * ((dend - iD + dD - 1) / dD);
-    if(hend > iH)
-        hend -= dH * ((hend - iH + dH - 1) / dH);
-    if(wend > iW)
-        wend -= dW * ((wend - iW + dW - 1) / dW);
-
-    // from here on coords[2..4] are reused as the loop variables over the input window; coords[0..1] (batch, channel) stay fixed
-    switch (poolingMode) {
-
-        /*** max ***/
-        case 0: {
-            T max = -DataTypeUtils::max<T>();
-            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
-                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH){
-                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
-                        T val = x[shape::getOffset(xShapeInfo, coords)];
-                        if (val > max)
-                            max = val;
-                    }
-                }
-            }
-            z[zOffset] = max;
-        }
-        break;
-
-        /*** avg ***/
-        case 1: {
-            T sum = static_cast<T>(0.);
-            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
-                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
-                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
-                        sum += x[shape::getOffset(xShapeInfo, coords)];
-
-            if (extraParam0 == 0) {         //Exclude padding
-                uint a = (dend - dstart) / dD + ((dend - dstart) % dD == 0 ? 0 : 1);
-                uint b = (hend - hstart) / dH + ((hend - hstart) % dH == 0 ? 0 : 1);
-                uint c = (wend - wstart) / dW + ((wend - wstart) % dW == 0 ? 0 : 1);
-                sum /=  static_cast<T>(a * b * c);                                       // a, b, c = ceil(extent / dilation) per axis, i.e. the number of in-bounds window samples (accounts for dilation)
-            }
-            else if (extraParam0 == 1)    //Include padding
-                sum /= kProd;
-
-            z[zOffset] = sum;                // for any other extraParam0 the plain sum is stored
-        }
-        break;
-
-        /*** pnorm ***/
-        case 2: {
-            T sum = static_cast<T>(0.);
-            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
-                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
-                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
-                        sum += sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[shape::getOffset(xShapeInfo, coords)]), extraParam0);
-
-            sum = sd::math::nd4j_pow<T,T,T>(sum, (T) 1.f / extraParam0);   // p-th root, p = extraParam0
-
-            z[zOffset] = sum;
-        }
-        break;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-static void pooling3dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                const void* vx, const Nd4jLong* xShapeInfo,
-                                      void* vz, const Nd4jLong* zShapeInfo,
-                                const int kD, const int kH, const int kW,
-                                const int sD, const int sH, const int sW,
-                                const int pD, const int pH, const int pW,
-                                const int dD, const int dH, const int dW,
-                                const int poolingMode, const int extraParam0) {
-
-    // Plain forwarding launcher: the caller chooses grid size, block size, dynamic shared memory size and stream.
-    const cudaStream_t launchStream = *stream;
-
-    pooling3dCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, launchStream>>>(
-        vx, xShapeInfo,
-        vz, zShapeInfo,
-        kD, kH, kW,
-        sD, sH, sW,
-        pD, pH, pW,
-        dD, dH, dW,
-        poolingMode, extraParam0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-void ConvolutionUtils::pooling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-
-    // 3D pooling forward pass on the device: one thread per output element.
-    PointersManager manager(block.launchContext(), "pooling3d");
-
-    // launch geometry
-    const int blockSize   = MAX_NUM_THREADS / 2;
-    const int gridSize    = (output.lengthOf() + blockSize - 1) / blockSize;             // ceil(length / blockSize)
-    const int shmemBytes  = output.rankOf() * sizeof(Nd4jLong) * blockSize  + 128;       // rank coordinates per thread, plus 128 spare bytes
-    auto cudaStream       = block.launchContext()->getCudaStream();
-
-    NDArray::prepareSpecialUse({&output}, {&input});
-    BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dCudaLauncher, (gridSize, blockSize, shmemBytes, cudaStream, input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);
-    NDArray::registerSpecialUse({&output}, {&input});
-
-    manager.synchronize();
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-__global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-    // 2D pooling backward pass; poolingMode: 0 - max, 1 - avg, 2 - pnorm. Every thread handles at most one element of gradO and adds its contribution into gradI.
-    // x: input [bS, iC, iH, iW]
-    // y: gradO [bS, iC, oH, oW]
-    // z: gradI [bS, iC, iH, iW] -> gradI is output in this function
-    // z is only accumulated into here; dynamic shared memory must hold `rank` (= 4) Nd4jLong coordinates per thread of the block
-    const T* x = reinterpret_cast<const T*>(vx);
-    const T* y = reinterpret_cast<const T*>(vy);
-          T* z = reinterpret_cast<T*>(vz);
-
-    Nd4jLong coord2, coord3;                     // position of the window maximum (used in max mode only)
-    __shared__ int rank, kHeff, kWeff, iH, iW, kProd;
-    __shared__ Nd4jLong yLen, *sharedMem;
-
-    if (threadIdx.x == 0) {                      // thread 0 initialises the block-wide values
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        yLen = shape::length(yShapeInfo);
-        rank = 4;
-
-        kHeff = kH + (kH - 1) * (dH - 1);        // effective kernel sizes accounting for dilation
-        kWeff = kW + (kW - 1) * (dW - 1);
-
-        iH = xShapeInfo[3];                      // NOTE(review): assumes shapeInfo is laid out as [rank, dim0, dim1, ...] - confirm
-        iW = xShapeInfo[4];
-
-        kProd = kH * kW;
-    }
-    __syncthreads();
-
-    const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if(yInd >= yLen)                             // surplus threads of the last block exit (after the barrier above)
-        return;
-
-    auto coords = sharedMem + threadIdx.x * rank;   // this thread's private slice of the shared coordinate buffer
-
-    shape::index2coords(yInd, yShapeInfo, coords);
-
-    const auto yOffset = shape::getOffset(yShapeInfo, coords);
-
-    int hstart = coords[2] * sH - pH;
-    int wstart = coords[3] * sW - pW;
-    int hend = hstart + kHeff;
-    int wend = wstart + kWeff;
-    if(hstart < 0)                               // clamp the window to the input in whole dilation steps (integer ceil-division)
-        hstart += dH * ((-hstart + dH - 1) / dH);
-    if(wstart < 0)
-        wstart += dW * ((-wstart + dW - 1) / dW);
-    if(hend > iH)
-        hend -= dH * ((hend - iH + dH - 1) / dH);
-    if(wend > iW)
-        wend -= dW * ((wend - iW + dW - 1) / dW);
-
-    // from here on coords[2..3] are reused as the loop variables over the input window; coords[0..1] (batch, channel) stay fixed
-    switch (poolingMode) {
-
-        /*** max ***/
-        case 0: {
-            coord2 = hstart;
-            coord3 = wstart;
-
-            T max = -DataTypeUtils::max<T>();
-            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
-                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW){
-                    T val = x[shape::getOffset(xShapeInfo, coords)];
-                    if (val > max) {
-                        max = val;
-                        coord2 = coords[2];
-                        coord3 = coords[3];
-                    }
-                }
-            }
-            coords[2] = coord2;
-            coords[3] = coord3;
-            auto zOffset = shape::getOffset(zShapeInfo, coords);
-            sd::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], y[yOffset]);   // the whole gradient goes to the first maximum found; atomic presumably because several gradO elements can map to the same gradI element
-            //z[zOffset] += y[yOffset];
-        }
-        break;
-
-        /*** avg ***/
-        case 1: {
-
-            T val = y[yOffset];
-
-            if (extraParam0 == 0)         //Exclude padding
-                val /= sd::math::nd4j_ceil<double,T>(static_cast<double>(hend - hstart) / static_cast<double>(dH)) * sd::math::nd4j_ceil<double,T>(static_cast<double>(wend - wstart) / static_cast<double>(dW));   //Accounts for dilation
-            else if (extraParam0 == 1)    //Include padding
-                val /= kProd;
-
-            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
-                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
-                    sd::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(zShapeInfo, coords)], val);   // the same share is added to every in-bounds window position
-        }
-        break;
-
-        /*** pnorm ***/
-        case 2: {
-
-            T sum = static_cast<T>(0.);
-            T val = y[yOffset];
-
-            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
-                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
-                    sum += sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[shape::getOffset(xShapeInfo, coords)]), extraParam0);
-
-            val *= sd::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);   // gradO * (sum |x|^p)^((1-p)/p), p = extraParam0
-
-            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
-                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) {
-                    const auto xOffset = shape::getOffset(xShapeInfo, coords);
-                    const auto zOffset = shape::getOffset(zShapeInfo, coords);
-                    sd::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * sd::math::nd4j_sgn<T,T>(x[xOffset]));
-                }
-            }
-        }
-        break;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-template <typename T>
-static void pooling2dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                    const void* vx, const Nd4jLong* xShapeInfo,
-                                    const void* vy, const Nd4jLong* yShapeInfo,
-                                          void* vz, const Nd4jLong* zShapeInfo,
-                                    const int kH, const int kW,
-                                    const int sH, const int sW,
-                                    const int pH, const int pW,
-                                    const int dH, const int dW,
-                                    const int poolingMode, const int extraParam0) {
-
-    // Plain forwarding launcher: the caller chooses grid size, block size, dynamic shared memory size and stream.
-    const cudaStream_t launchStream = *stream;
-
-    pooling2dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, launchStream>>>(
-        vx, xShapeInfo,
-        vy, yShapeInfo,
-        vz, zShapeInfo,
-        kH, kW,
-        sH, sW,
-        pH, pW,
-        dH, dW,
-        poolingMode, extraParam0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Device backprop for 2D pooling.
-// Zeroes gradI, launches pooling2dBPCuda (dispatched on input's data type over FLOAT_TYPES) with
-// enough threads to cover every element of gradO, then synchronizes through the PointersManager.
-void ConvolutionUtils::pooling2dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-
-    // the kernel only adds into gradI, so it has to start from zero
-    gradI.nullify();
-
-    PointersManager manager(block.launchContext(), "pooling2dBP");
-
-    // launch configuration: ceil(length(gradO) / blockSize) blocks of blockSize threads
-    const int blockSize  = 256;
-    const int gridSize   = (gradO.lengthOf() + blockSize - 1) / blockSize;
-    // rank Nd4jLong coordinates of scratch per thread, plus 128 extra bytes
-    const int shmemBytes = gradO.rankOf() * sizeof(Nd4jLong) * blockSize + 128;
-
-    NDArray::prepareSpecialUse({&gradI}, {&input, &gradO});
-    BUILD_SINGLE_SELECTOR(input.dataType(), pooling2dBPCudaLauncher,
-                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
-                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
-                           gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(),
-                           gradI.specialBuffer(), gradI.specialShapeInfo(),
-                           kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0),
-                          FLOAT_TYPES);
-    NDArray::registerSpecialUse({&gradI}, {&input, &gradO});
-
-    manager.synchronize();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Backward pass of 3D pooling (max / avg / pnorm).
-//
-// Every thread owns one element of gradO (y): it rebuilds that element's pooling window over the
-// input (x) and atomically adds the matching gradient contribution(s) into gradI (z).
-// Because results are only ever added, z must be zero-filled before the launch.
-//
-// Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= length(y). Dynamic shared
-// memory must provide at least blockDim.x * 5 * sizeof(Nd4jLong) bytes (per-thread coordinates).
-//
-// poolingMode: 0 - max, 1 - avg, 2 - pnorm; any other value leaves z untouched.
-// extraParam0: avg   -> 0 divides by the number of in-bounds window taps, 1 divides by kD*kH*kW
-//              pnorm -> the exponent p
-template <typename T>
-__global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-
-    // x: input [bS, iC, iD, iH, iW]
-    // y: gradO [bS, iC, oD, oH, oW]
-    // z: gradI [bS, iC, iD, iH, iW] -> gradI is output in this function
-
-
-    const T* x = reinterpret_cast<const T*>(vx);
-    const T* y = reinterpret_cast<const T*>(vy);
-          T* z = reinterpret_cast<T*>(vz);
-
-    __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
-    __shared__ Nd4jLong yLen, *sharedMem;
-
-    // thread 0 publishes the block-wide constants; the barrier below makes them visible to all threads
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        yLen = shape::length(yShapeInfo);
-        rank = 5;
-
-        // effective (dilated) kernel extents
-        kDeff = kD + (kD - 1) * (dD - 1);
-        kHeff = kH + (kH - 1) * (dH - 1);
-        kWeff = kW + (kW - 1) * (dW - 1);
-
-        // shapeInfo[0] is the rank, so dims 2..4 live at indices 3..5
-        iD = xShapeInfo[3];
-        iH = xShapeInfo[4];
-        iW = xShapeInfo[5];
-
-        kProd = kD * kH * kW;
-    }
-    __syncthreads();
-
-    const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
-
-    // tail guard: the grid may contain more threads than gradO has elements
-    if(yInd >= yLen)
-        return;
-
-    // per-thread coordinate scratch inside the dynamic shared-memory buffer
-    auto coords = sharedMem + threadIdx.x * rank;
-
-    shape::index2coords(yInd, yShapeInfo, coords);
-
-    const auto yOffset = shape::getOffset(yShapeInfo, coords);
-
-    // pooling window of this output element in input coordinates (may stick out into the padding)
-    int dstart = coords[2] * sD - pD;
-    int hstart = coords[3] * sH - pH;
-    int wstart = coords[4] * sW - pW;
-    int dend = dstart + kDeff;
-    int hend = hstart + kHeff;
-    int wend = wstart + kWeff;
-
-    // clamp the window to the input while staying on the dilation lattice
-    if(dstart < 0)
-        dstart += dD * ((-dstart + dD - 1) / dD);
-    if(hstart < 0)
-        hstart += dH * ((-hstart + dH - 1) / dH);
-    if(wstart < 0)
-        wstart += dW * ((-wstart + dW - 1) / dW);
-    if(dend > iD)
-        dend -= dD * ((dend - iD + dD - 1) / dD);
-    if(hend > iH)
-        hend -= dH * ((hend - iH + dH - 1) / dH);
-    if(wend > iW)
-        wend -= dW * ((wend - iW + dW - 1) / dW);
-
-
-    switch (poolingMode) {
-
-        /*** max ***/
-        case 0: {
-
-            // an empty window has no element that could receive the gradient
-            if (dstart >= dend || hstart >= hend || wstart >= wend)
-                break;
-
-            // Default the arg-max to the first window element: if no value ever compares greater
-            // than the sentinel (e.g. NaN inputs) the target coordinates are still well defined
-            // instead of being read uninitialized.
-            Nd4jLong coord2 = dstart, coord3 = hstart, coord4 = wstart;
-
-            T max = -DataTypeUtils::max<T>();
-            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
-                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH){
-                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
-                        T val = x[shape::getOffset(xShapeInfo, coords)];
-                        if (val > max) {
-                            max = val;
-                            coord2 = coords[2];
-                            coord3 = coords[3];
-                            coord4 = coords[4];
-                        }
-                    }
-                }
-            }
-            // the whole incoming gradient goes to the arg-max position
-            coords[2] = coord2;
-            coords[3] = coord3;
-            coords[4] = coord4;
-            sd::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(zShapeInfo, coords)], y[yOffset]);
-        }
-        break;
-
-        /*** avg ***/
-        case 1: {
-
-            T val = y[yOffset];
-
-            if (extraParam0 == 0)         //Exclude padding
-                val /= sd::math::nd4j_ceil<double,T>(static_cast<double>(dend - dstart) / static_cast<double>(dD))  * sd::math::nd4j_ceil<double,T>(static_cast<double>(hend - hstart) / static_cast<double>(dH))     * sd::math::nd4j_ceil<double,T>(static_cast<double>(wend - wstart)    / static_cast<double>(dW));   //Accounts for dilation
-            else if (extraParam0 == 1)    //Include padding
-                val /= kProd;
-
-            // every in-bounds window element receives the same share
-            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
-                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
-                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
-                        sd::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(zShapeInfo, coords)], val);
-        }
-        break;
-
-        /*** pnorm ***/
-        case 2: {
-
-            T sum = static_cast<T>(0.);
-            T val = y[yOffset];
-
-            // first pass: sum of |x|^p over the window
-            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
-                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
-                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
-                        sum += sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[shape::getOffset(xShapeInfo, coords)]), extraParam0);
-
-            // common factor gradO * sum^((1 - p) / p)
-            val *= sd::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
-
-            // second pass: per-element factor |x|^(p - 1) * sgn(x)
-            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
-                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
-                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
-                        const auto xOffset = shape::getOffset(xShapeInfo, coords);
-                        const auto zOffset = shape::getOffset(zShapeInfo, coords);
-                        sd::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * sd::math::nd4j_sgn<T,T>(x[xOffset]));
-                    }
-                }
-            }
-        }
-        break;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Host-side launcher for pooling3dBPCuda<T>.
-// Enqueues the kernel on *stream with the given grid size, block size and dynamic shared-memory
-// size (bytes); every other argument is handed to the kernel unchanged.
-template <typename T>
-static void pooling3dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                    const void* vx, const Nd4jLong* xShapeInfo,
-                                    const void* vy, const Nd4jLong* yShapeInfo,
-                                          void* vz, const Nd4jLong* zShapeInfo,
-                                    const int kD, const int kH, const int kW,
-                                    const int sD, const int sH, const int sW,
-                                    const int pD, const int pH, const int pW,
-                                    const int dD, const int dH, const int dW,
-                                    const int poolingMode, const int extraParam0) {
-
-    pooling3dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
-        vx, xShapeInfo,
-        vy, yShapeInfo,
-        vz, zShapeInfo,
-        kD, kH, kW,
-        sD, sH, sW,
-        pD, pH, pW,
-        dD, dH, dW,
-        poolingMode, extraParam0);
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Device backprop for 3D pooling.
-// Zeroes gradI, launches pooling3dBPCuda (dispatched on input's data type over FLOAT_TYPES) with
-// enough threads to cover every element of gradO, then synchronizes through the PointersManager.
-void ConvolutionUtils::pooling3dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
-
-    // the kernel only adds into gradI, so it has to start from zero
-    gradI.nullify();
-
-    PointersManager manager(block.launchContext(), "pooling3dBP");
-
-    // launch configuration: ceil(length(gradO) / blockSize) blocks of blockSize threads
-    const int blockSize  = MAX_NUM_THREADS / 2;
-    const int gridSize   = (gradO.lengthOf() + blockSize - 1) / blockSize;
-    // rank Nd4jLong coordinates of scratch per thread, plus 128 extra bytes
-    const int shmemBytes = gradO.rankOf() * sizeof(Nd4jLong) * blockSize + 128;
-
-    NDArray::prepareSpecialUse({&gradI}, {&input, &gradO});
-    BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dBPCudaLauncher,
-                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
-                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
-                           gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(),
-                           gradI.specialBuffer(), gradI.specialShapeInfo(),
-                           kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0),
-                          FLOAT_TYPES);
-    NDArray::registerSpecialUse({&gradI}, {&input, &gradO});
-
-    manager.synchronize();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Backward pass of 2D convolution built from im2col / tensorDot / col2im.
-//
-// Tensors:
-//   input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-//   weights [kH, kW, iC, oC] (wFormat 0), [oC, iC, kH, kW] (wFormat 1), [oC, kH, kW, iC] (otherwise)
-//   bias    [oC]                       (not read in this function)
-//   gradO   [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-//   gradI   same shape as input, epsilon                   (always written)
-//   gradW   same shape as weights                          (written only when non-null)
-//   gradB   [oC], a rank-2 array is flattened for the sum  (written only when non-null)
-//
-// Scalars: kH/kW kernel size, sH/sW strides, pH/pW paddings (handed to calcPadding2D together with
-// paddingMode, 0-VALID, 1-SAME), dH/dW dilations, isNCHW 0-NHWC, 1-NCHW.
-template <typename X, typename Y>
-static void conv2dBP_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-
-    // sizes: batch, input channels, input height/width, output channels, output height/width
-    int bS, iC, iH, iW, oC, oH, oW;
-    // positions of the corresponding dimensions inside the arrays
-    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
-
-    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
-
-    // gradO axes holding bS, oH, oW
-    const std::vector<int> batchAndSpatialAxes = isNCHW ? std::vector<int>{0, 2, 3} : std::vector<int>{0, 1, 2};
-
-    // the computation below works on NCHW views; the NHWC case gets permuted views that are freed at the end
-    if(!isNCHW) {
-        input = new NDArray(input->permute({0, 3, 1, 2}));                      // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
-        gradI = new NDArray(gradI->permute({0, 3, 1, 2}));                      // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
-    }
-
-    // permutations applied by tensorDot to its output (gradW resp. columns), chosen by weights layout
-    std::vector<int> gradWPermute, colPermute;
-    switch(wFormat) {
-        case 0:
-            gradWPermute = {2, 0, 1, 3};
-            colPermute   = {2, 3, 1, 0, 4, 5};
-            break;
-        case 1:
-            gradWPermute = {1, 2, 3, 0};
-            colPermute   = {1, 2, 3, 0, 4, 5};
-            break;
-        default:
-            gradWPermute = {3, 1, 2, 0};
-            colPermute   = {2, 3, 1, 0, 4, 5};
-            break;
-    }
-
-    NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
-
-    // ----- gradW ----- //
-    if(gradW) {
-        // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
-        helpers::im2col(*block.launchContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));
-        // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC]
-        sd::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, batchAndSpatialAxes, gradWPermute);
-    }
-
-    // ----- gradB ----- //
-    if(gradB) {
-        // a rank-2 bias gradient is reduced into through a temporary flattened NDArray
-        const bool flatten = gradB->rankOf() == 2;
-        NDArray* biasTarget = flatten ? new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()})) : gradB;
-
-        gradO->reduceAlongDimension(reduce::Sum, *biasTarget, batchAndSpatialAxes, false);                   // sum over bS, oH, oW
-
-        if(flatten)
-            delete biasTarget;
-    }
-
-    // ----- gradI ----- //
-    // [kH, kW, iC, oC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]
-    // [oC, iC, kH, kW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, bS, oH, oW]
-    // [oC, kH, kW, iC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]
-    sd::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, colPermute);
-
-    // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
-    helpers::col2im(*block.launchContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW);
-
-    // release the permuted views created for the NHWC case
-    if(!isNCHW) {
-        delete input;
-        delete gradI;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Public entry point for conv2d backprop: selects the conv2dBP_ instantiation from input's data
-// type (FLOAT_TYPES) and forwards every argument unchanged.
-void ConvolutionUtils::conv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_,
-                                (block, input, weights, bias, gradO, gradI, gradW, gradB,
-                                 kH, kW, sH, sW, pH, pW, dH, dW,
-                                 paddingMode, isNCHW, wFormat),
-                                FLOAT_TYPES);
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Backward pass of depthwise 2D convolution built from im2col / tensorDot / col2im.
-// gradI is always written; gradW and gradB are written only when the respective pointer is non-null.
-// bias is accepted for signature symmetry but is not read here.
-template <typename X, typename Y>
-static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-
-    // input    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
-    // weights  [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
-    // bias     [oC] = [iC*mC]
-    // gradO    [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
-    // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
-    // gradW    [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
-    // gradB    [oC]
-
-    //  kH          filter(kernel) height
-    //  kW          filter(kernel) width
-    //  sH          strides height
-    //  sW          strides width
-    //  pH          paddings height
-    //  pW          paddings width
-    //  dH          dilations height
-    //  dW          dilations width
-    //  paddingMode  0-VALID, 1-SAME
-    //  isNCHW      0-NHWC, 1-NCHW
-
-    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
-    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
-    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
-    mC = weights->sizeAt(indWmC);                           // channels multiplier
-
-    // each "modif" entry is {permutation, reshape} applied by tensorDot to the matching operand
-    std::vector<std::vector<Nd4jLong>> modifColumns = {{1,2,3,0,4,5}, {iC, kH*kW, bS*oH*oW}};      // [bS,iC,kH,kW,oH,oW] -> [iC, kH*kW, bS*oH*oW]
-    std::vector<std::vector<Nd4jLong>> modifGradO1, modifGradO2, modifWeights;
-    std::vector<Nd4jLong> gradOreShape;
-
-    if(!isNCHW) {
-        gradOreShape = {bS, oH, oW, iC, mC};                                            // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
-        modifGradO1 = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
-        modifGradO2 = {{3,0,1,2},{iC, mC, bS*oH*oW}};                                   // [bS,oH,oW,iC*mC] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
-        // NCHW views of the NHWC arrays; both temporaries are released at the end of the function
-        input = new NDArray(input->permute({0, 3, 1, 2}));                             // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
-        gradI = new NDArray(gradI->permute({0, 3, 1, 2}));                             // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
-    }
-    else {
-        gradOreShape = {bS, iC, mC, oH, oW};                                            // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
-        modifGradO1 = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
-        modifGradO2 = {{1,0,2,3},{iC, mC, bS*oH*oW}};                                   // [bS,iC*mC,oH,oW] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
-    }
-
-    if(0 == wFormat)
-        modifWeights = {{2,0,1,3},{iC,kH*kW,mC}};
-    else if(1 == wFormat)
-        modifWeights = {{1,2,3,0},{iC,kH*kW,mC}};
-    else
-        modifWeights = {{3,1,2,0},{iC,kH*kW,mC}};
-
-    if(paddingMode == 1)                       // SAME
-        ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);
-
-    NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
-    NDArray gradOreshaped = gradO->reshape(gradO->ordering(), gradOreShape);
-
-    // ----- calculation of gradW ----- //
-    // Guarded like in conv2dBP_: the im2col result is consumed only by the gradW product (columns is
-    // overwritten by the gradI product below), so both steps are skipped when gradW is not requested.
-    if(gradW) {
-        helpers::im2col(*input->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
-        sd::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, modifWeights);  // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC]
-    }
-
-    // ----- calculation of gradB ----- //
-    if(gradB) {
-        NDArray* gradBR = gradB;
-        // a rank-2 bias gradient is reduced into through a temporary flattened NDArray
-        if(gradB->rankOf() == 2)
-            gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
-        gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1}, false);                      // sum over bS, oH, oW
-        if(gradBR != gradB)
-            delete gradBR;
-    }
-
-    //----- calculation of gradI -----//
-    sd::MmulHelper::tensorDot(weights, gradO, &columns, modifWeights, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW]
-    helpers::col2im(*input->getContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW);                                       // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
-
-    // release the permuted views created for the NHWC case
-    if(!isNCHW) {
-        delete input;
-        delete gradI;
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Public entry point for depthwise conv2d backprop: selects the depthwiseConv2dBP_ instantiation
-// from input's data type (FLOAT_TYPES). The graph context `block` is not passed on to the helper.
-void ConvolutionUtils::depthwiseConv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
-    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_,
-                                (input, weights, bias, gradO, gradI, gradW, gradB,
-                                 kH, kW, sH, sW, pH, pW, dH, dW,
-                                 paddingMode, isNCHW, wFormat),
-                                FLOAT_TYPES);
-}
-
-
-//////////////////////////////////////////////////////////////////////////
-// 2D upsampling by element repetition: each thread writes one element of z, copied from the x
-// element whose height/width coordinates are the output coordinates divided by factorH/factorW.
-//
-//   x: [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
-//   z: [bS, iC, factorH*iH, factorW*iW] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
-//
-// Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= length(z); dynamic shared
-// memory of at least blockDim.x * 4 * sizeof(Nd4jLong) bytes for per-thread coordinates.
-template <typename T>
-__global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int factorH, const int factorW, const bool isNCHW) {
-
-    const T* src = reinterpret_cast<const T*>(vx);
-          T* dst = reinterpret_cast<T*>(vz);
-
-    __shared__ int rank, dimIH;
-    __shared__ Nd4jLong zLen, *sharedMem;
-
-    // thread 0 sets up the block-wide values, published to the block by the barrier below
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        rank   = 4;
-        zLen   = shape::length(zShapeInfo);
-        dimIH  = isNCHW ? 2 : 1;               // position of the height axis
-    }
-    __syncthreads();
-
-    const auto outIndex = threadIdx.x + blockIdx.x * blockDim.x;
-
-    // surplus threads of the last block have nothing to do
-    if(outIndex >= zLen)
-        return;
-
-    // this thread's slice of the shared coordinate scratch
-    auto pos = sharedMem + threadIdx.x * rank;
-
-    shape::index2coords(outIndex, zShapeInfo, pos);
-
-    const auto dstOffset = shape::getOffset(zShapeInfo, pos);
-
-    // map output coordinates back to the source element
-    pos[dimIH]     = pos[dimIH]     / factorH;
-    pos[dimIH + 1] = pos[dimIH + 1] / factorW;
-
-    dst[dstOffset] = src[shape::getOffset(xShapeInfo, pos)];
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Host-side launcher for upsampling2dCuda<T>: enqueues the kernel on *stream with the given grid
-// size, block size and dynamic shared-memory size (bytes), forwarding the remaining arguments.
-template <typename T>
-static void upsampling2dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                     const void* vx, const Nd4jLong* xShapeInfo,
-                                           void* vz, const Nd4jLong* zShapeInfo,
-                                     const int factorH, const int factorW, const bool isNCHW) {
-
-    upsampling2dCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
-        vx, xShapeInfo,
-        vz, zShapeInfo,
-        factorH, factorW, isNCHW);
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Device 2D upsampling: launches upsampling2dCuda (dispatched on input's data type over
-// FLOAT_TYPES) with enough threads to cover every element of output, then synchronizes.
-void ConvolutionUtils::upsampling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) {
-
-    PointersManager manager(block.launchContext(), "upsampling2d");
-
-    // launch configuration: ceil(length(output) / blockSize) blocks of blockSize threads
-    const int blockSize  = MAX_NUM_THREADS / 2;
-    const int gridSize   = (output.lengthOf() + blockSize - 1) / blockSize;
-    // rank Nd4jLong coordinates of scratch per thread, plus 128 extra bytes
-    const int shmemBytes = output.rankOf() * sizeof(Nd4jLong) * blockSize + 128;
-
-    NDArray::prepareSpecialUse({&output}, {&input});
-    BUILD_SINGLE_SELECTOR(input.dataType(), upsampling2dCudaLauncher,
-                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
-                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
-                           output.specialBuffer(), output.specialShapeInfo(),
-                           factorH, factorW, isNCHW),
-                          FLOAT_TYPES);
-    NDArray::registerSpecialUse({&output}, {&input});
-
-    manager.synchronize();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// 3D upsampling by element repetition: each thread writes one element of z, copied from the x
-// element whose depth/height/width coordinates are the output coordinates divided by
-// factorD/factorH/factorW.
-//
-//   x: [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
-//   z: [bS, iC, factorD*iD, factorH*iH, factorW*iW] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
-//
-// Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= length(z); dynamic shared
-// memory of at least blockDim.x * 5 * sizeof(Nd4jLong) bytes for per-thread coordinates.
-template <typename T>
-__global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
-
-    const T* src = reinterpret_cast<const T*>(vx);
-          T* dst = reinterpret_cast<T*>(vz);
-
-    __shared__ int rank, dimID;
-    __shared__ Nd4jLong zLen, *sharedMem;
-
-    // thread 0 sets up the block-wide values, published to the block by the barrier below
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        rank  = 5;
-        zLen  = shape::length(zShapeInfo);
-        dimID = isNCDHW ? 2 : 1;               // position of the depth axis
-    }
-    __syncthreads();
-
-    const auto outIndex = threadIdx.x + blockIdx.x * blockDim.x;
-
-    // surplus threads of the last block have nothing to do
-    if(outIndex >= zLen)
-        return;
-
-    // this thread's slice of the shared coordinate scratch
-    auto pos = sharedMem + threadIdx.x * rank;
-
-    shape::index2coords(outIndex, zShapeInfo, pos);
-
-    const auto dstOffset = shape::getOffset(zShapeInfo, pos);
-
-    // map output coordinates back to the source element
-    pos[dimID]     = pos[dimID]     / factorD;
-    pos[dimID + 1] = pos[dimID + 1] / factorH;
-    pos[dimID + 2] = pos[dimID + 2] / factorW;
-
-    dst[dstOffset] = src[shape::getOffset(xShapeInfo, pos)];
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Host-side launcher for upsampling3dCuda<T>: enqueues the kernel on *stream with the given grid
-// size, block size and dynamic shared-memory size (bytes), forwarding the remaining arguments.
-template <typename T>
-static void upsampling3dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                     const void* vx, const Nd4jLong* xShapeInfo,
-                                           void* vz, const Nd4jLong* zShapeInfo,
-                                     const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
-
-    upsampling3dCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
-        vx, xShapeInfo,
-        vz, zShapeInfo,
-        factorD, factorH, factorW, isNCDHW);
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Device 3D upsampling: launches upsampling3dCuda (dispatched on input's data type over
-// FLOAT_TYPES) with enough threads to cover every element of output, then synchronizes.
-void ConvolutionUtils::upsampling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
-
-    PointersManager manager(block.launchContext(), "upsampling3d");
-
-    // launch configuration: ceil(length(output) / blockSize) blocks of blockSize threads
-    const int blockSize  = MAX_NUM_THREADS / 2;
-    const int gridSize   = (output.lengthOf() + blockSize - 1) / blockSize;
-    // rank Nd4jLong coordinates of scratch per thread, plus 128 extra bytes
-    const int shmemBytes = output.rankOf() * sizeof(Nd4jLong) * blockSize + 128;
-
-    NDArray::prepareSpecialUse({&output}, {&input});
-    BUILD_SINGLE_SELECTOR(input.dataType(), upsampling3dCudaLauncher,
-                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
-                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
-                           output.specialBuffer(), output.specialShapeInfo(),
-                           factorD, factorH, factorW, isNCDHW),
-                          FLOAT_TYPES);
-    NDArray::registerSpecialUse({&output}, {&input});
-
-    manager.synchronize();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Backward pass of 2D upsampling: each thread owns one element of z (gradI) and stores into it the
-// sum of the factorH x factorW patch of x (gradO) that the forward pass produced from that element.
-// The factors are recovered from the shapes as xDim / zDim along the height and width axes.
-//
-// Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= length(z); dynamic shared
-// memory of at least blockDim.x * 4 * sizeof(Nd4jLong) bytes for per-thread coordinates.
-template <typename T>
-__global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool isNCHW) {
-
-    // x (gradO) has shape [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
-    // z (gradI) has shape [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
-
-    const T* x = reinterpret_cast<const T*>(vx);
-          T* z = reinterpret_cast<T*>(vz);
-
-    __shared__ int rank, dimIH;
-    __shared__ uint factorH, factorW;
-    __shared__ Nd4jLong zLen, *sharedMem;
-
-    // thread 0 publishes the block-wide constants; the barrier below makes them visible to all threads
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        dimIH = isNCHW ? 2 : 1;
-        zLen  = shape::length(zShapeInfo);
-        rank  = 4;
-
-        // shapeInfo[0] is the rank, hence the +1 when addressing dimension sizes
-        factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
-        factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
-    }
-    __syncthreads();
-
-    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
-
-    // tail guard: the grid may contain more threads than gradI has elements
-    if(zInd >= zLen)
-        return;
-
-    // per-thread coordinate scratch inside the dynamic shared-memory buffer
-    auto coords = sharedMem + threadIdx.x * rank;
-
-    shape::index2coords(zInd, zShapeInfo, coords);
-
-    const auto zOffset = shape::getOffset(zShapeInfo, coords);
-
-    // top-left corner of the matching patch in gradO
-    const Nd4jLong zCoord2 = coords[dimIH]     * factorH;
-    const Nd4jLong zCoord3 = coords[dimIH + 1] * factorW;
-
-    // accumulate in a register and store once, instead of a global read-modify-write per tap
-    T sum = static_cast<T>(0.f);
-
-    for(coords[dimIH] = zCoord2; coords[dimIH] < zCoord2 + factorH; ++coords[dimIH])
-        for(coords[dimIH + 1] = zCoord3; coords[dimIH + 1] < zCoord3 + factorW; ++coords[dimIH + 1])
-            sum += x[shape::getOffset(xShapeInfo, coords)];
-
-    z[zOffset] = sum;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Host-side launcher for upsampling2dBPCuda<T>: enqueues the kernel on *stream with the given grid
-// size, block size and dynamic shared-memory size (bytes), forwarding the remaining arguments.
-template <typename T>
-static void upsampling2dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                       const void* vx, const Nd4jLong* xShapeInfo,
-                                             void* vz, const Nd4jLong* zShapeInfo,
-                                       const bool isNCHW) {
-
-    upsampling2dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
-        vx, xShapeInfo,
-        vz, zShapeInfo,
-        isNCHW);
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Device backprop for 2D upsampling: launches upsampling2dBPCuda (dispatched on gradI's data type
-// over FLOAT_TYPES) with enough threads to cover every element of gradI, then synchronizes.
-void ConvolutionUtils::upsampling2dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) {
-
-    PointersManager manager(block.launchContext(), "upsampling2d_bp");
-
-    // launch configuration: ceil(length(gradI) / blockSize) blocks of blockSize threads
-    const int blockSize  = MAX_NUM_THREADS / 2;
-    const int gridSize   = (gradI.lengthOf() + blockSize - 1) / blockSize;
-    // rank Nd4jLong coordinates of scratch per thread, plus 128 extra bytes
-    const int shmemBytes = gradI.rankOf() * sizeof(Nd4jLong) * blockSize + 128;
-
-    NDArray::prepareSpecialUse({&gradI}, {&gradO});
-    BUILD_SINGLE_SELECTOR(gradI.dataType(), upsampling2dBPCudaLauncher,
-                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
-                           gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(),
-                           gradI.specialBuffer(), gradI.specialShapeInfo(),
-                           isNCHW),
-                          FLOAT_TYPES);
-    NDArray::registerSpecialUse({&gradI}, {&gradO});
-
-    manager.synchronize();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Backward pass of 3D upsampling: each thread owns one element of z (gradI) and stores into it the
-// sum of the factorD x factorH x factorW patch of x (gradO) that the forward pass produced from
-// that element. The factors are recovered from the shapes as xDim / zDim along depth/height/width.
-//
-// Launch layout: 1D grid of 1D blocks with gridDim.x * blockDim.x >= length(z); dynamic shared
-// memory of at least blockDim.x * 5 * sizeof(Nd4jLong) bytes for per-thread coordinates.
-template <typename T>
-__global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool isNCDHW) {
-
-    // x (gradO) has shape [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
-    // z (gradI) has shape [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
-
-    const T* x = reinterpret_cast<const T*>(vx);
-          T* z = reinterpret_cast<T*>(vz);
-
-    __shared__ int rank, dimID;
-    __shared__ uint factorD, factorH, factorW;
-    __shared__ Nd4jLong zLen, *sharedMem;
-
-    // thread 0 publishes the block-wide constants; the barrier below makes them visible to all threads
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        dimID = isNCDHW ? 2 : 1;
-        zLen  = shape::length(zShapeInfo);
-        rank  = 5;
-
-        // shapeInfo[0] is the rank, hence the +1 when addressing dimension sizes
-        factorD = xShapeInfo[dimID + 1] / zShapeInfo[dimID + 1];
-        factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
-        factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
-    }
-    __syncthreads();
-
-    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
-
-    // tail guard: the grid may contain more threads than gradI has elements
-    if(zInd >= zLen)
-        return;
-
-    // per-thread coordinate scratch inside the dynamic shared-memory buffer
-    auto coords = sharedMem + threadIdx.x * rank;
-
-    shape::index2coords(zInd, zShapeInfo, coords);
-
-    const auto zOffset = shape::getOffset(zShapeInfo, coords);
-
-    // corner of the matching patch in gradO
-    const Nd4jLong zCoord2 = coords[dimID]     * factorD;
-    const Nd4jLong zCoord3 = coords[dimID + 1] * factorH;
-    const Nd4jLong zCoord4 = coords[dimID + 2] * factorW;
-
-    // accumulate in a register and store once, instead of a global read-modify-write per tap
-    T sum = static_cast<T>(0.f);
-
-    for(coords[dimID] = zCoord2; coords[dimID] < zCoord2 + factorD; ++coords[dimID])
-        for(coords[dimID + 1] = zCoord3; coords[dimID + 1] < zCoord3 + factorH; ++coords[dimID + 1])
-            for(coords[dimID + 2] = zCoord4; coords[dimID + 2] < zCoord4 + factorW; ++coords[dimID + 2])
-                sum += x[shape::getOffset(xShapeInfo, coords)];
-
-    z[zOffset] = sum;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Host-side launcher for upsampling3dBPCuda<T>: enqueues the kernel on *stream with the given grid
-// size, block size and dynamic shared-memory size (bytes), forwarding the remaining arguments.
-template <typename T>
-static void upsampling3dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                       const void* vx, const Nd4jLong* xShapeInfo,
-                                             void* vz, const Nd4jLong* zShapeInfo,
-                                       const bool isNCDHW) {
-
-    upsampling3dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
-        vx, xShapeInfo,
-        vz, zShapeInfo,
-        isNCDHW);
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Device backprop for 3D upsampling: launches upsampling3dBPCuda (dispatched on gradI's data type
-// over FLOAT_TYPES) with enough threads to cover every element of gradI, then synchronizes.
-void ConvolutionUtils::upsampling3dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCDHW) {
-
-    PointersManager manager(block.launchContext(), "upsampling3d_bp");
-
-    // launch configuration: ceil(length(gradI) / blockSize) blocks of blockSize threads
-    const int blockSize  = MAX_NUM_THREADS / 2;
-    const int gridSize   = (gradI.lengthOf() + blockSize - 1) / blockSize;
-    // rank Nd4jLong coordinates of scratch per thread, plus 128 extra bytes
-    const int shmemBytes = gradI.rankOf() * sizeof(Nd4jLong) * blockSize + 128;
-
-    NDArray::prepareSpecialUse({&gradI}, {&gradO});
-    BUILD_SINGLE_SELECTOR(gradI.dataType(), upsampling3dBPCudaLauncher,
-                          (gridSize, blockSize, shmemBytes, block.launchContext()->getCudaStream(),
-                           gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(),
-                           gradI.specialBuffer(), gradI.specialShapeInfo(),
-                           isNCDHW),
-                          FLOAT_TYPES);
-    NDArray::registerSpecialUse({&gradI}, {&gradO});
-
-    manager.synchronize();
-}
-
-
-
-
-
-
-
-
-
-}
-}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_col2vol.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_col2vol.cu
new file mode 100644
index 000000000..d751c2b1e
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_col2vol.cu
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+#include <math/templatemath.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+// columns [bS, iC, kD, kH, kW, oD, oH, oW] to be de-convoluted to volume [bS, iC, iD, iH, iW]; sD/sH/sW are strides, pD/pH/pW paddings, dD/dH/dW dilations
+template <typename T>
+static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShapeInfo, void* volume, const Nd4jLong* volShapeInfo,  const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+    // gather formulation: every volume element sums all column entries that map onto it and is written exactly once (no atomics)
+    const T* col = reinterpret_cast<const T*>(columns);
+          T* vol = reinterpret_cast<T*>(volume);
+
+    __shared__ uint kD, kH, kW, oD, oH, oW, *sharedMem;
+    __shared__ Nd4jLong volLen;
+    // thread 0 caches block-wide constants; colShapeInfo is indexed as [rank, 8 dims, 8 strides] (assumed layout, consistent with the shape in the header comment)
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<uint*>(shmem);
+
+        oD = colShapeInfo[6];
+        oH = colShapeInfo[7];
+        oW = colShapeInfo[8];
+        // effective (dilated) kernel extents: d * (k - 1) + 1
+        kD = dD * (colShapeInfo[3] - 1) + 1;
+        kH = dH * (colShapeInfo[4] - 1) + 1;
+        kW = dW * (colShapeInfo[5] - 1) + 1;
+
+        volLen  = shape::length(volShapeInfo);
+    }
+    __syncthreads();
+    // per-thread scratch slice (8 uints) of the dynamic shared buffer, used to hold coordinates
+    auto coords = sharedMem + threadIdx.x * 8;
+
+    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+    // grid-stride loop over all volume elements
+    for (Nd4jLong i = tid; i < volLen; i += gridDim.x * blockDim.x) {
+
+        shape::index2coords(i, volShapeInfo, coords);
+
+        const auto volOffset = shape::getOffset(volShapeInfo, coords);
+
+        const auto bSiCoffset = coords[0] * colShapeInfo[9] + coords[1] * colShapeInfo[10];
+        // position of this element shifted by the padding
+        const uint imD = coords[2] + pD;
+        const uint imH = coords[3] + pH;
+        const uint imW = coords[4] + pW;
+        // [start, end) range of output positions whose (dilated) kernel window can cover this element
+        const uint colDstart = (imD < kD) ? 0 : (imD - kD) / sD + 1;
+        const uint colHstart = (imH < kH) ? 0 : (imH - kH) / sH + 1;
+        const uint colWstart = (imW < kW) ? 0 : (imW - kW) / sW + 1;
+
+        const uint colDend = sd::math::nd4j_min<uint>(imD / sD + 1, oD);
+        const uint colHend = sd::math::nd4j_min<uint>(imH / sH + 1, oH);
+        const uint colWend = sd::math::nd4j_min<uint>(imW / sW + 1, oW);
+
+        T val = 0;
+        // coords[2..4] are reused for the offset inside the kernel window; offsets not divisible by the dilation are skipped
+        for(uint colD = colDstart; colD < colDend; ++colD) {
+            coords[2] = imD - colD * sD;
+            if(coords[2] % dD != 0) continue;
+
+            for(uint colH = colHstart; colH < colHend; ++colH) {
+                coords[3] = imH - colH * sH;
+                if(coords[3] % dH != 0) continue;
+
+                for(uint colW = colWstart; colW < colWend; ++colW) {
+                    coords[4] = imW - colW * sW;
+                    if(coords[4] % dW != 0) continue;
+
+                    val += col[bSiCoffset + (coords[2]/dD)*colShapeInfo[11] + (coords[3]/dH)*colShapeInfo[12] + (coords[4]/dW)*colShapeInfo[13] + colD*colShapeInfo[14] + colH*colShapeInfo[15] + colW*colShapeInfo[16]];
+
+                }
+            }
+        }
+
+        vol[volOffset] = val;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void col2volCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                const void* columns, const Nd4jLong* colShapeInfo,
+                                      void* volume, const Nd4jLong* volShapeInfo,
+                                const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+    // typed launch of col2volCuda on the supplied stream; sharedMem is the dynamic shared-memory size in bytes
+    col2volCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(columns, colShapeInfo, volume, volShapeInfo, sD, sH, sW, pD, pH, pW, dD, dH, dW);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::col2vol(sd::graph::Context& block, const NDArray& col, NDArray& vol, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+    // host entry point: launches col2volCuda for the data type of vol (FLOAT_TYPES only), then calls manager.synchronize()
+    PointersManager manager(block.launchContext(), "col2vol");
+    // enough blocks for one thread per volume element; NOTE: the kernel indexes the shared buffer with a fixed stride of 8 uints per thread, i.e. it assumes col is rank 8
+    const int threadsPerBlock = MAX_NUM_THREADS / 4;
+    const int blocksPerGrid = (vol.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+    const int sharedMem = col.rankOf() * sizeof(uint) * threadsPerBlock  + 256;
+
+    NDArray::prepareSpecialUse({&vol}, {&col});
+    BUILD_SINGLE_SELECTOR(vol.dataType(), col2volCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, block.launchContext()->getCudaStream(), col.getSpecialBuffer(), col.getSpecialShapeInfo(), vol.specialBuffer(), vol.specialShapeInfo(), sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);
+    NDArray::registerSpecialUse({&vol}, {&col});
+
+    manager.synchronize();
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_conv2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_conv2d.cu
new file mode 100644
index 000000000..494ce4a81
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_conv2d.cu
@@ -0,0 +1,105 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <ops/declarable/helpers/im2col.h>
+#include <ops/declarable/helpers/col2im.h>
+#include<ops/declarable/helpers/addBias.h>
+#include <helpers/MmulHelper.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void conv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    // 2D convolution computed as im2col followed by a tensor contraction with the weights; template parameters X and Y are not referenced in the body
+    // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    // weights [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
+    // bias    [oC]
+    // output  [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+
+    // kH  filter(kernel) height
+    // kW  filter(kernel) width
+    // sH  strides height
+    // sW  strides width
+    // pH  paddings height
+    // pW  paddings width
+    // dH  dilations height
+    // dW  dilations width
+    // paddingMode 0-VALID, 1-SAME
+    // isNCHW     1-NCHW,  0-NHWC
+    // wFormat    0-[kH, kW, iC, oC], 1-[oC, iC, kH, kW], 2-[oC, kH, kW, iC] (any value other than 0 or 1 is treated as 2)
+    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+    std::vector<int> permutForOutput;
+
+    if(isNCHW)
+        permutForOutput = {0, 3, 1, 2};                                             // [bS, oH, oW, oC] -> [bS, oC, oH, oW]
+    else
+        input = new NDArray(input->permute({0, 3, 1, 2}));                         // [bS, iH, iW, iC] -> [bS, iC, iH, iW] if NHWC
+    // axes of weights that hold kH, kW, iC (in this order) for the given wFormat
+    std::vector<int> wAxes;
+    if(0 == wFormat)
+        wAxes = {0, 1, 2};
+    else if(1 == wFormat)
+        wAxes = {2, 3, 1};
+    else
+        wAxes = {1, 2, 3};
+    // im2col fills colP and tensorDot then reads col, so colP is expected to be a permuted view sharing col's buffer
+    NDArray col('c', {bS, oH, oW, kH, kW, iC}, input->dataType(), input->getContext());
+    NDArray colP = col.permute({0, 5, 3, 4, 1, 2});            // {bS, iC, kH, kW, oH, oW}
+    NDArray mmulResult('f', {bS*oH*oW, oC}, output->dataType(), output->getContext());
+
+    //----- calculation of output -----//
+    auto ctx = block.launchContext();
+    helpers::im2col(*ctx, *input, colP, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
+    MmulHelper::tensorDot(&col, weights, &mmulResult, {3,4,5}, wAxes, {}); // [bS, oH, oW, kH, kW, iC] x [kH, kW, iC, oC] = [bS, oH, oW, oC]
+
+    //----- assign mmulResult to output  -----//
+    if(isNCHW) {
+        mmulResult.reshapei({bS, oH, oW, oC});
+        mmulResult.permutei(permutForOutput);
+    }
+    output->assign(mmulResult);
+
+    //----- add biases if required -----//
+    if(bias)
+        // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
+        helpers::addBias(block, *output, *bias, *output, isNCHW);
+    // release the permuted NDArray allocated above for the NHWC case
+    if(!isNCHW)
+        delete input;
+
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::conv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // type dispatch on input->dataType(), limited to FLOAT_TYPES; all arguments are forwarded unchanged
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_conv2dBP.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_conv2dBP.cu
new file mode 100644
index 000000000..dbf4ee390
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_conv2dBP.cu
@@ -0,0 +1,125 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <ops/declarable/helpers/im2col.h>
+#include <ops/declarable/helpers/col2im.h>
+#include<ops/declarable/helpers/addBias.h>
+#include <helpers/MmulHelper.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void conv2dBP_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    // backprop of the 2D convolution: gradW and gradB are computed only when non-null, gradI is always computed; bias and the template parameters X, Y are not referenced
+    // input   [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    // weights [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
+    // bias    [oC]
+    // gradO   [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
+
+    // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    // gradW    [kH, kW, iC, oC], [oC, iC, kH, kW], [oC, kH, kW, iC]
+    // gradB    [oC]
+
+    // kH         filter(kernel) height
+    // kW         filter(kernel) width
+    // sH         strides height
+    // sW         strides width
+    // pH         paddings height
+    // pW         paddings width
+    // dH         dilations height
+    // dW         dilations width
+    // paddingMode 0-VALID, 1-SAME
+    // isNCHW     0-NHWC, 1-NCHW
+    // wFormat    0-[kH, kW, iC, oC], 1-[oC, iC, kH, kW], 2-[oC, kH, kW, iC] (any value other than 0 or 1 is treated as 2)
+    int bS, iC, iH, iW, oC, oH, oW;                             // batch size, input channels, input height/width, output channels, output height/width;
+    int indIOioC, indIiH, indWoC, indWiC, indWkH, indOoH;       // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWoC, indWkH, indOoH);
+
+    ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW, paddingMode);
+
+    std::vector<int> gradOaxesForDot;
+    // gradOaxesForDot: gradO axes holding bS, oH, oW; for NHWC, input and gradI are replaced by heap-allocated [bS, iC, iH, iW] permutations (freed at the end)
+    if(!isNCHW) {
+        gradOaxesForDot  = {0, 1, 2};                                           // bS, oH, oW
+        input = new NDArray(input->permute({0, 3, 1, 2}));                      // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
+        gradI = new NDArray(gradI->permute({0, 3, 1, 2}));                      // [bS, iH, iW, iC] -> [bS, iC, iH, iW]
+    } else {
+        gradOaxesForDot  = {0, 2, 3};                                           // bS, oH, oW
+    }
+    // wPermut: gradW -> [iC, kH, kW, oC]; colPermut: columns [bS, iC, kH, kW, oH, oW] -> layout of the weights x gradO product (see the gradI section)
+    std::vector<int> wPermut, colPermut;
+    if(0 == wFormat) {
+        wPermut   = {2, 0, 1, 3};
+        colPermut = {2, 3, 1, 0, 4, 5};
+    }
+    else if(1 == wFormat) {
+        wPermut   = {1, 2, 3, 0};
+        colPermut = {1, 2, 3, 0, 4, 5};
+    }
+    else {
+        wPermut   = {3, 1, 2, 0};
+        colPermut = {2, 3, 1, 0, 4, 5};
+    }
+
+    NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
+
+    // ----- calculation of gradW ----- //
+    if(gradW) {
+        auto ctx = block.launchContext();
+        helpers::im2col(*ctx, *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));   // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
+        sd::MmulHelper::tensorDot(&columns, gradO, gradW, {0,4,5}, gradOaxesForDot, wPermut);       // [bS, iC, kH, kW, oH, oW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, oC]
+    }
+
+    // ----- calculation of gradB ----- //
+    if(gradB) {
+        NDArray* gradBR = gradB;
+        if(gradB->rankOf() == 2)
+            gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
+        gradO->reduceAlongDimension(reduce::Sum, *gradBR, gradOaxesForDot, false);                          // sum over bS, oH, oW
+        if(gradBR != gradB)
+            delete gradBR;
+    }
+
+    //----- calculation of gradI -----//
+    // [kH, kW, iC, oC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]
+    // [oC, iC, kH, kW] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [iC, kH, kW, bS, oH, oW]
+    // [oC, kH, kW, iC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]
+    sd::MmulHelper::tensorDot(weights, gradO, &columns, {indWoC}, {indIOioC}, colPermut);  // [kH, kW, iC, oC]/[oC, iC, kH, kW]/[oC, kH, kW, iC] x [bS, oH, oW, oC]/[bS, oC, oH, oW] = [kH, kW, iC, bS, oH, oW]/[iC, kH, kW, bS, oH, oW]
+
+    helpers::col2im(*block.launchContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW);                          // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
+    // free the permuted NDArrays allocated above for the NHWC case
+    if(!isNCHW) {
+        delete input;
+        delete gradI;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::conv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // type dispatch on input->dataType(), limited to FLOAT_TYPES; all arguments are forwarded unchanged
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_depthwiseConv2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_depthwiseConv2d.cu
new file mode 100644
index 000000000..bbf5d5892
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_depthwiseConv2d.cu
@@ -0,0 +1,101 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <ops/declarable/helpers/im2col.h>
+#include <ops/declarable/helpers/col2im.h>
+#include<ops/declarable/helpers/addBias.h>
+#include <helpers/MmulHelper.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void depthwiseConv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    // depthwise 2D convolution: im2col on the input, then a contraction with the weights batched over iC via tensorDot; template parameters X and Y are not referenced in the body
+    // input     [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    // weights   [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    // bias      [oC] = iC*mC
+    // output    [bS, oH, oW, iC*mC] (NHWC) or [bS, iC*mC, oH, oW] (NCHW)
+
+    // kH           filter(kernel) height
+    // kW           filter(kernel) width
+    // sH           strides height
+    // sW           strides width
+    // pH           paddings height
+    // pW           paddings width
+    // dH           dilations height
+    // dW           dilations width
+    // paddingMode   0-VALID, 1-SAME
+    // isNCHW       0-NHWC,  1-NCHW
+    // wFormat      0-[kH, kW, iC, mC], 1-[mC, iC, kH, kW], 2-[mC, kH, kW, iC] (any value other than 0 or 1 is treated as 2)
+    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
+    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    mC = weights->sizeAt(indWmC);                           // channels multiplier
+
+    std::vector<std::vector<Nd4jLong>> modifColumns = {{1,0,4,5,2,3}, {iC,bS*oH*oW,kH*kW}};  // [bS,iC,kH,kW,oH,oW] -> [iC,bS,oH,oW,kH,kW] -> [iC,bS*oH*oW,kH*kW]
+    std::vector<std::vector<Nd4jLong>> modifOutput, modifWeights;
+    std::vector<Nd4jLong> outReShape;
+    // for NHWC the input is replaced by a heap-allocated [bS, iC, iH, iW] permutation (freed at the end)
+    if(!isNCHW) {
+        outReShape = {bS, oH, oW, iC, mC};                                              // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
+        modifOutput = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
+        input = new NDArray(input->permute({0, 3, 1, 2}));                              // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
+    }
+    else {
+        outReShape = {bS, iC, mC, oH, oW};                                              // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
+        modifOutput = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
+    }
+
+    if(0 == wFormat)
+        modifWeights = {{2,0,1,3},{iC,kH*kW,mC}};
+    else if(1 == wFormat)
+        modifWeights = {{1,2,3,0},{iC,kH*kW,mC}};
+    else
+        modifWeights = {{3,1,2,0},{iC,kH*kW,mC}};
+
+    if(paddingMode == 1)                       // SAME
+        ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);
+
+    NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
+    NDArray outputReshaped = output->reshape(output->ordering(), outReShape, false);
+    // tensorDot writes through outputReshaped, so it is expected to alias output's buffer - NOTE(review): confirm the meaning of the 'false' reshape argument
+    helpers::im2col(*output->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
+    MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, modifWeights, modifOutput);              // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC]
+
+    if(bias)
+        // output->applyBroadcast(broadcast::Add, {indIOioC}, bias);
+        helpers::addBias(block, *output, *bias, *output, isNCHW);
+    // release the permuted NDArray allocated above for the NHWC case
+    if(!isNCHW)
+        delete input;
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::depthwiseConv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // type dispatch on input->dataType(), limited to FLOAT_TYPES; all arguments are forwarded unchanged
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_depthwiseConv2dBP.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_depthwiseConv2dBP.cu
new file mode 100644
index 000000000..b06af6166
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_depthwiseConv2dBP.cu
@@ -0,0 +1,120 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <ops/declarable/helpers/im2col.h>
+#include <ops/declarable/helpers/col2im.h>
+#include <helpers/MmulHelper.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void depthwiseConv2dBP_(const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    // backprop of the depthwise 2D convolution: computes gradW, gradI and (when gradB is non-null) gradB; bias and the template parameters X, Y are not referenced
+    // input    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW)
+    // weights  [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    // bias     [oC] = [iC*mC]
+    // gradO    [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW), epsilon_next
+    // gradI    [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW), epsilon
+    // gradW    [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    // gradB    [oC]
+
+    //  kH          filter(kernel) height
+    //  kW          filter(kernel) width
+    //  sH          strides height
+    //  sW          strides width
+    //  pH          paddings height
+    //  pW          paddings width
+    //  dH          dilations height
+    //  dW          dilations width
+    //  paddingMode  0-VALID, 1-SAME
+    //  isNCHW      0-NHWC, 1-NCHW
+    //  wFormat     0-[kH, kW, iC, mC], 1-[mC, iC, kH, kW], 2-[mC, kH, kW, iC] (any value other than 0 or 1 is treated as 2)
+    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier(oC = iC*mC), output channels, output height/width
+    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *gradO, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    mC = weights->sizeAt(indWmC);                           // channels multiplier
+
+    std::vector<std::vector<Nd4jLong>> modifColumns = {{1,2,3,0,4,5}, {iC, kH*kW, bS*oH*oW}};      // [bS,iC,kH,kW,oH,oW] -> [iC, kH*kW, bS*oH*oW]
+    std::vector<std::vector<Nd4jLong>> modifGradO1, modifGradO2, modifWeights;
+    std::vector<Nd4jLong> gradOreShape;
+    // for NHWC, input and gradI are replaced by heap-allocated [bS, iC, iH, iW] permutations (freed at the end)
+    if(!isNCHW) {
+        gradOreShape = {bS, oH, oW, iC, mC};                                            // [bS,oH,oW,iC*mC] -> [bS,oH,oW,iC,mC]
+        modifGradO1 = {{3,0,1,2,4},{iC, bS*oH*oW, mC}};                                 // [bS,oH,oW,iC,mC] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
+        modifGradO2 = {{3,0,1,2},{iC, mC, bS*oH*oW}};                                   // [bS,oH,oW,iC*mC] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
+        input = new NDArray(input->permute({0, 3, 1, 2}));                             // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
+        gradI = new NDArray(gradI->permute({0, 3, 1, 2}));                             // [bS,iH,iW,iC]    -> [bS,iC,iH,iW]
+    }
+    else {
+        gradOreShape = {bS, iC, mC, oH, oW};                                            // [bS,iC*mC,oH,oW] -> [bS,iC,mC,oH,oW]
+        modifGradO1 = {{1,0,3,4,2},{iC, bS*oH*oW, mC}};                                 // [bS,iC,mC,oH,oW] -> [iC,bS,oH,oW,mC] -> [iC,bS*oH*oW,mC]
+        modifGradO2 = {{1,0,2,3},{iC, mC, bS*oH*oW}};                                   // [bS,iC*mC,oH,oW] -> [iC*mC,bS,oH,oW] -> [iC,mC,bS*oH*oW]
+    }
+
+    if(0 == wFormat)
+        modifWeights = {{2,0,1,3},{iC,kH*kW,mC}};
+    else if(1 == wFormat)
+        modifWeights = {{1,2,3,0},{iC,kH*kW,mC}};
+    else
+        modifWeights = {{3,1,2,0},{iC,kH*kW,mC}};
+
+    if(paddingMode == 1)                       // SAME
+        ConvolutionUtils::calcPadding2D(pH, pW, oH, oW, iH, iW, kH, kW, sH, sW, dH, dW);
+
+    NDArray columns(input->ordering(), {bS, iC, kH, kW, oH, oW}, input->dataType(), input->getContext());
+    NDArray gradOreshaped = gradO->reshape(gradO->ordering(), gradOreShape);
+
+    // ----- calculation of gradW ----- //
+    // NOTE(review): gradW is passed to tensorDot without a null check, whereas conv2dBP_ guards its gradW - confirm callers always supply it
+    helpers::im2col(*input->getContext(), *input, columns, kH, kW, sH, sW, pH, pW, dH, dW, NDArrayFactory::create(0.f, input->getContext()));  // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW]
+    sd::MmulHelper::tensorDot(&columns, &gradOreshaped, gradW, modifColumns, modifGradO1, modifWeights);  // [iC, kW*kH, bS*oH*oW] x [iC, bS*oH*oW, mC] = [iC, kH*kW, mC]
+
+    // ----- calculation of gradB ----- //
+    if(gradB) {
+        NDArray* gradBR = gradB;
+        if(gradB->rankOf() == 2)
+            gradBR = new NDArray(gradB->reshape(gradB->ordering(), {(int)gradB->lengthOf()}));
+        gradO->reduceAlongDimension(reduce::Sum, *gradBR, {0,indOoH,indOoH+1}, false);                      // sum over bS, oH, oW
+        if(gradBR != gradB)
+            delete gradBR;
+    }
+
+    //----- calculation of gradI -----//
+    sd::MmulHelper::tensorDot(weights, gradO, &columns, modifWeights, modifGradO2, modifColumns); // [iC, kH*kW, mC] x [iC, mC, bS*oH*oW] = [iC, kW*kH, bS*oH*oW]
+    helpers::col2im(*input->getContext(), columns, *gradI, sH, sW, pH, pW, iH, iW, dH, dW);                                       // [bS, iC, kH, kW, oH, oW] is de-convoluted to [bS, iC, iH, iW]
+    // free the permuted NDArrays allocated above for the NHWC case
+    if(!isNCHW) {
+        delete input;
+        delete gradI;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::depthwiseConv2dBP(sd::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);  // type dispatch on input->dataType(), limited to FLOAT_TYPES; note that block is not forwarded to depthwiseConv2dBP_
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling2d.cu
new file mode 100644
index 000000000..eb336cb76
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling2d.cu
@@ -0,0 +1,342 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <exceptions/cuda_exception.h>
+#include <helpers/PointersManager.h>
+#include <math/templatemath.h>
+
+namespace sd {
+namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+// Forward average pooling over the two trailing dimensions.
+//   vx, xShapeInfo - input buffer and its shape descriptor,  input is  [bS, iC, iH, iW]
+//   vz, zShapeInfo - output buffer and its shape descriptor, output is [bS, iC, oH, oW]
+//   kH,kW / sH,sW / pH,pW / dH,dW - kernel size, stride, padding and dilation along height / width
+//   extraParam0 - 1: divide every window sum by kH*kW (padding included),
+//                 any other value: divide by the number of input elements the clipped window covers
+// Output elements are processed in a grid-stride loop, so any 1D launch configuration covers the whole output.
+// Sizes, strides and the output length are held in int, i.e. they are assumed to fit into 32 bits.
+template <typename X, typename Z>
+static __global__ void avgPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
+
+    // input is  [bS, iC, iH, iW]
+    // output is [bS, iC, oH, oW]
+
+    const auto x = reinterpret_cast<const X*>(vx);
+          auto z = reinterpret_cast<Z*>(vz);
+
+    __shared__ int iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
+
+    // thread 0 of every block publishes the shape-derived constants for the rest of the block
+    if (threadIdx.x == 0) {
+        iC = shape::sizeAt(xShapeInfo, 1);
+        oH = shape::sizeAt(zShapeInfo, 2);
+        oW = shape::sizeAt(zShapeInfo, 3);
+        iH = shape::sizeAt(xShapeInfo, 2);
+        iW = shape::sizeAt(xShapeInfo, 3);
+
+        strideB = shape::stride(xShapeInfo)[0];
+        strideC = shape::stride(xShapeInfo)[1];
+        strideY = shape::stride(xShapeInfo)[2];
+        strideX = shape::stride(xShapeInfo)[3];
+
+        strideOB = shape::stride(zShapeInfo)[0];
+        strideOC = shape::stride(zShapeInfo)[1];
+        strideOY = shape::stride(zShapeInfo)[2];
+        strideOX = shape::stride(zShapeInfo)[3];
+
+        length = shape::length(zShapeInfo);
+
+        // replace kernel H/W with *effective* kernel H/W accounting for dilation
+        kHEff = kH + (kH-1)*(dH-1);
+        kWEff = kW + (kW-1)*(dW-1);
+    }
+    __syncthreads();
+
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
+
+        // decompose the flat output index into (n, c, ph, pw)
+        const int pw = index % oW;
+        const int ph = (index / oW) % oH;
+        const int c = (index / oW / oH) % iC;
+        const int n = index / oW / oH / iC;
+
+        int hstart = sH * ph - pH;
+        int wstart = sW * pw - pW;
+        int hend = hstart + kHEff;
+        int wend = wstart + kWEff;
+
+        // Clip the window to the input while staying on the dilation lattice.
+        // Integer ceil-division is exact for every output type; taking the ceiling in Z (possibly half / bfloat16)
+        // is not once the operands exceed the integer range that type represents exactly.
+        if (hstart < 0)
+            hstart += dH * ((-hstart + dH - 1) / dH);
+        if (wstart < 0)
+            wstart += dW * ((-wstart + dW - 1) / dW);
+        if (hend > iH)
+            hend -= dH * ((hend - iH + dH - 1) / dH);
+        if (wend > iW)
+            wend -= dW * ((wend - iW + dW - 1) / dW);
+
+        // number of input elements covered by the clipped window (accounts for dilation)
+        int pool_size = sd::math::nd4j_ceil<double,int>((double) (hend-hstart) / (double) dH) * sd::math::nd4j_ceil<double,int>((double) (wend-wstart) / (double) dW);
+
+        Z sum = 0.0f;
+
+        const X *inSlice = x + (n * strideB + c * strideC);
+
+        for (int h = hstart; h < hend; h += dH)
+            for (int w = wstart; w < wend; w += dW)
+                sum += static_cast<Z>(inSlice[h * strideY + w * strideX]);
+
+        int divide_factor = pool_size;  // case 0: exclude padding
+        if (extraParam0 == 1)           // case 1: include padding
+            divide_factor = kH * kW;
+
+        z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = sum / static_cast<Z>(divide_factor);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for avgPooling2dCuda<X, Z>.
+// Enqueues the kernel on the CUDA stream of `block` with a fixed configuration of 512 blocks x 512 threads and
+// 4192 bytes of dynamic shared memory; all remaining arguments are forwarded to the kernel unchanged.
+template <typename X, typename Z>
+static void avgPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
+    const int numBlocks      = 512;
+    const int numThreads     = 512;
+    const int sharedMemBytes = 4192;
+
+    auto stream = block.getCudaStream();
+
+    avgPooling2dCuda<X, Z><<<numBlocks, numThreads, sharedMemBytes, *stream>>>(
+        vx, vxShapeInfo,
+        vz, vzShapeInfo,
+        kH, kW, sH, sW, pH, pW, dH, dW,
+        extraParam0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Forward p-norm pooling over the two trailing dimensions: z = (sum |x|^p)^(1/p) per window.
+//   vx, xShapeInfo - input buffer and its shape descriptor,  input is  [bS, iC, iH, iW]
+//   vz, zShapeInfo - output buffer and its shape descriptor, output is [bS, iC, oH, oW]
+//   kH,kW / sH,sW / pH,pW / dH,dW - kernel size, stride, padding and dilation along height / width
+//   extraParam0 - the exponent p
+// Output elements are processed in a grid-stride loop, so any 1D launch configuration covers the whole output.
+// Sizes, strides and the output length are held in int, i.e. they are assumed to fit into 32 bits.
+template <typename X, typename Z>
+static __global__ void pnormPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
+
+    // input is  [bS, iC, iH, iW]
+    // output is [bS, iC, oH, oW]
+
+    const auto x = reinterpret_cast<const X*>(vx);
+          auto z = reinterpret_cast<Z*>(vz);
+
+    __shared__ int iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
+
+    // thread 0 of every block publishes the shape-derived constants for the rest of the block
+    if (threadIdx.x == 0) {
+        iC = shape::sizeAt(xShapeInfo, 1);
+        oH = shape::sizeAt(zShapeInfo, 2);
+        oW = shape::sizeAt(zShapeInfo, 3);
+        iH = shape::sizeAt(xShapeInfo, 2);
+        iW = shape::sizeAt(xShapeInfo, 3);
+
+        strideB = shape::stride(xShapeInfo)[0];
+        strideC = shape::stride(xShapeInfo)[1];
+        strideY = shape::stride(xShapeInfo)[2];
+        strideX = shape::stride(xShapeInfo)[3];
+
+        strideOB = shape::stride(zShapeInfo)[0];
+        strideOC = shape::stride(zShapeInfo)[1];
+        strideOY = shape::stride(zShapeInfo)[2];
+        strideOX = shape::stride(zShapeInfo)[3];
+
+        length = shape::length(zShapeInfo);
+
+        // replace kernel H/W with *effective* kernel H/W accounting for dilation
+        kHEff = kH + (kH-1)*(dH-1);
+        kWEff = kW + (kW-1)*(dW-1);
+    }
+    __syncthreads();
+
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
+
+        // decompose the flat output index into (n, c, ph, pw)
+        const int pw = index % oW;
+        const int ph = (index / oW) % oH;
+        const int c = (index / oW / oH) % iC;
+        const int n = index / oW / oH / iC;
+
+        int hstart = sH * ph - pH;
+        int wstart = sW * pw - pW;
+        int hend = hstart + kHEff;
+        int wend = wstart + kWEff;
+
+        // Clip the window to the input while staying on the dilation lattice.
+        // Integer ceil-division is exact for every output type; taking the ceiling in Z (possibly half / bfloat16)
+        // is not once the operands exceed the integer range that type represents exactly.
+        if (hstart < 0)
+            hstart += dH * ((-hstart + dH - 1) / dH);
+        if (wstart < 0)
+            wstart += dW * ((-wstart + dW - 1) / dW);
+        if (hend > iH)
+            hend -= dH * ((hend - iH + dH - 1) / dH);
+        if (wend > iW)
+            wend -= dW * ((wend - iW + dW - 1) / dW);
+
+        Z sum = 0.f;
+
+        const X *inSlice = x + (n * strideB + c * strideC);
+
+        // accumulate |x|^p over the clipped, dilated window
+        for (int h = hstart; h < hend; h += dH)
+            for (int w = wstart; w < wend; w += dW)
+                sum += sd::math::nd4j_pow<Z, Z, Z>(static_cast<Z>(sd::math::nd4j_abs<X>(inSlice[h * strideY + w * strideX])), extraParam0);
+
+        // p-th root of the accumulated sum
+        z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = sd::math::nd4j_pow<Z, Z, Z>(sum, (Z) 1.0f / extraParam0);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for pnormPooling2dCuda<X, Z>.
+// Enqueues the kernel on the CUDA stream of `block` with a fixed configuration of 512 blocks x 512 threads and
+// 4192 bytes of dynamic shared memory; all remaining arguments are forwarded to the kernel unchanged.
+template <typename X, typename Z>
+static void pnormPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
+    const int numBlocks      = 512;
+    const int numThreads     = 512;
+    const int sharedMemBytes = 4192;
+
+    auto stream = block.getCudaStream();
+
+    pnormPooling2dCuda<X, Z><<<numBlocks, numThreads, sharedMemBytes, *stream>>>(
+        vx, vxShapeInfo,
+        vz, vzShapeInfo,
+        kH, kW, sH, sW, pH, pW, dH, dW,
+        extraParam0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Forward max pooling over the two trailing dimensions.
+//   vx, xShapeInfo - input buffer and its shape descriptor,  input is  [bS, iC, iH, iW]
+//   vz, zShapeInfo - output buffer and its shape descriptor, output is [bS, iC, oH, oW]
+//   kH,kW / sH,sW / pH,pW / dH,dW - kernel size, stride, padding and dilation along height / width
+//   extraParam0 - not used by this kernel
+// Output elements are processed in a grid-stride loop, so any 1D launch configuration covers the whole output.
+// Sizes, strides and the output length are held in int, i.e. they are assumed to fit into 32 bits.
+template <typename X, typename Z>
+static __global__ void maxPooling2dCuda(const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
+
+    // input is  [bS, iC, iH, iW]
+    // output is [bS, iC, oH, oW]
+
+    const auto x = reinterpret_cast<const X*>(vx);
+          auto z = reinterpret_cast<Z*>(vz);
+
+    __shared__ int iC, oH, oW, iH, iW, strideB, strideC, strideY, strideX, strideOB, strideOC, strideOY, strideOX, length, kHEff, kWEff;
+
+    // thread 0 of every block publishes the shape-derived constants for the rest of the block
+    if (threadIdx.x == 0) {
+        iC = shape::sizeAt(xShapeInfo, 1);
+        oH = shape::sizeAt(zShapeInfo, 2);
+        oW = shape::sizeAt(zShapeInfo, 3);
+        iH = shape::sizeAt(xShapeInfo, 2);
+        iW = shape::sizeAt(xShapeInfo, 3);
+
+        strideB = shape::stride(xShapeInfo)[0];
+        strideC = shape::stride(xShapeInfo)[1];
+        strideY = shape::stride(xShapeInfo)[2];
+        strideX = shape::stride(xShapeInfo)[3];
+
+        strideOB = shape::stride(zShapeInfo)[0];
+        strideOC = shape::stride(zShapeInfo)[1];
+        strideOY = shape::stride(zShapeInfo)[2];
+        strideOX = shape::stride(zShapeInfo)[3];
+
+        length = shape::length(zShapeInfo);
+
+        // replace kernel H/W with *effective* kernel H/W accounting for dilation
+        kHEff = kH + (kH-1)*(dH-1);
+        kWEff = kW + (kW-1)*(dW-1);
+    }
+    __syncthreads();
+
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    for (int index = tid; index < length; index += blockDim.x * gridDim.x) {
+
+        // decompose the flat output index into (n, c, ph, pw)
+        const int pw = index % oW;
+        const int ph = (index / oW) % oH;
+        const int c = (index / oW / oH) % iC;
+        const int n = index / oW / oH / iC;
+
+        int hstart = sH * ph - pH;
+        int wstart = sW * pw - pW;
+        int hend = hstart + kHEff;
+        int wend = wstart + kWEff;
+
+        // Clip the window to the input while staying on the dilation lattice.
+        // Integer ceil-division is exact for every output type; taking the ceiling in Z (possibly half / bfloat16)
+        // is not once the operands exceed the integer range that type represents exactly.
+        if (hstart < 0)
+            hstart += dH * ((-hstart + dH - 1) / dH);
+        if (wstart < 0)
+            wstart += dW * ((-wstart + dW - 1) / dW);
+        if (hend > iH)
+            hend -= dH * ((hend - iH + dH - 1) / dH);
+        if (wend > iW)
+            wend -= dW * ((wend - iW + dW - 1) / dW);
+
+        // start from the lowest value so that any window element replaces it
+        Z max = -sd::DataTypeUtils::max<Z>();
+
+        const X *inSlice = x + (n * strideB + c * strideC);
+
+        for (int h = hstart; h < hend; h += dH) {
+            for (int w = wstart; w < wend; w += dW) {
+                Z v = static_cast<Z>(inSlice[h * strideY + w * strideX]);
+                if (v > max)
+                    max = v;
+            }
+        }
+
+        z[n * strideOB + c * strideOC + pw * strideOX + ph * strideOY] = max;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for maxPooling2dCuda<X, Z>.
+// Enqueues the kernel on the CUDA stream of `block` with a fixed configuration of 512 blocks x 512 threads and
+// 4192 bytes of dynamic shared memory; all remaining arguments are forwarded to the kernel unchanged.
+template <typename X, typename Z>
+static void maxPooling2dCudaLauncher(sd::LaunchContext & block, void *vx, Nd4jLong *vxShapeInfo, void *vz, Nd4jLong *vzShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int extraParam0) {
+    const int numBlocks      = 512;
+    const int numThreads     = 512;
+    const int sharedMemBytes = 4192;
+
+    auto stream = block.getCudaStream();
+
+    maxPooling2dCuda<X,Z><<<numBlocks, numThreads, sharedMemBytes, *stream>>>(
+        vx, vxShapeInfo,
+        vz, vzShapeInfo,
+        kH, kW, sH, sW, pH, pW, dH, dW,
+        extraParam0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Forward 2D pooling on the device.
+// Picks the max / avg / pnorm launcher from poolingMode, dispatches on input.dataType() (FLOAT_TYPES only),
+// then waits for the stream of block's launch context and throws if the synchronization reports an error.
+// Throws std::runtime_error for a poolingMode that is none of MAX_POOL, AVG_POOL, PNORM_POOL.
+void ConvolutionUtils::pooling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const PoolingType poolingMode, const int extraParam0) {
+
+    // the kernels read the device buffer, so bring it up to date first
+    if (!input.isActualOnDeviceSide())
+        input.syncToDevice();
+
+    auto launchCtx = block.launchContext();
+
+    switch (poolingMode) {
+
+        case MAX_POOL: {
+            BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), maxPooling2dCudaLauncher, (*launchCtx, input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
+            break;
+        }
+
+        case AVG_POOL: {
+            BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), avgPooling2dCudaLauncher, (*launchCtx, input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
+            break;
+        }
+
+        case PNORM_POOL: {
+            BUILD_SINGLE_SELECTOR_TWICE(input.dataType(), pnormPooling2dCudaLauncher, (*launchCtx, input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), kH, kW, sH, sW, pH, pW, dH, dW, extraParam0), FLOAT_TYPES);
+            break;
+        }
+
+        default:
+            throw std::runtime_error("Pooling2D: Unknown PoolingType used");
+    }
+
+    // record the device-side write of the output and read of the input
+    output.tickWriteDevice();
+    input.tickReadDevice();
+
+    const auto status = cudaStreamSynchronize(*launchCtx->getCudaStream());
+    if (status != 0)
+        throw cuda_exception::build("Pooling2D failed", status);
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling2dBP.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling2dBP.cu
new file mode 100644
index 000000000..26808ad4c
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling2dBP.cu
@@ -0,0 +1,188 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+#include <math/templatemath.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+// Backward pass of 2D pooling: scatters the output gradient y (gradO) into the input gradient z (gradI).
+//   vx, xShapeInfo - forward input,                 [bS, iC, iH, iW]
+//   vy, yShapeInfo - gradient w.r.t. pooling output, [bS, iC, oH, oW]
+//   vz, zShapeInfo - gradient w.r.t. input,          [bS, iC, iH, iW] (written by this kernel)
+//   kH,kW / sH,sW / pH,pW / dH,dW - kernel size, stride, padding and dilation along height / width
+//   poolingMode - 0: max, 1: avg, 2: pnorm; any other value leaves z untouched
+//   extraParam0 - avg: 0 excludes padding from the divisor, 1 divides by kH*kW; pnorm: the exponent p
+// Launch requirements visible in the code: one thread per element of y (threads with yInd >= yLen exit early) and
+// at least blockDim.x * 4 * sizeof(Nd4jLong) bytes of dynamic shared memory for the per-thread coordinates.
+// Contributions are only ever added to z (atomically), so z has to be zeroed before the launch.
+template <typename T>
+__global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+
+    // x: input [bS, iC, iH, iW]
+    // y: gradO [bS, iC, oH, oW]
+    // z: gradI [bS, iC, iH, iW] -> gradI is output in this function
+
+    const T* x = reinterpret_cast<const T*>(vx);
+    const T* y = reinterpret_cast<const T*>(vy);
+          T* z = reinterpret_cast<T*>(vz);
+
+    // per-thread position of the window maximum (max mode only)
+    Nd4jLong coord2, coord3;
+    __shared__ int rank, kHeff, kWeff, iH, iW, kProd;
+    __shared__ Nd4jLong yLen, *sharedMem;
+
+    // thread 0 publishes the block-wide constants and the base of the dynamic shared memory
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        yLen = shape::length(yShapeInfo);
+        rank = 4;
+
+        // effective kernel extent once dilation is taken into account
+        kHeff = kH + (kH - 1) * (dH - 1);
+        kWeff = kW + (kW - 1) * (dW - 1);
+
+        // NOTE(review): reads the shape descriptor directly; presumably entries 3 and 4 are the sizes of
+        // dimensions 2 and 3 (iH, iW) - confirm against the shape-info layout
+        iH = xShapeInfo[3];
+        iW = xShapeInfo[4];
+
+        kProd = kH * kW;
+    }
+    __syncthreads();
+
+    const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // surplus threads of the last block have nothing to do (placed after the barrier above)
+    if(yInd >= yLen)
+        return;
+
+    // this thread's private slice of rank coordinates inside the dynamic shared memory
+    auto coords = sharedMem + threadIdx.x * rank;
+
+    shape::index2coords(yInd, yShapeInfo, coords);
+
+    const auto yOffset = shape::getOffset(yShapeInfo, coords);
+
+    // pooling window in input coordinates; coords[2] / coords[3] still hold the output row / column here
+    int hstart = coords[2] * sH - pH;
+    int wstart = coords[3] * sW - pW;
+    int hend = hstart + kHeff;
+    int wend = wstart + kWeff;
+    // clip the window to the input, moving each bound by a whole multiple of the dilation (integer ceil-division)
+    if(hstart < 0)
+        hstart += dH * ((-hstart + dH - 1) / dH);
+    if(wstart < 0)
+        wstart += dW * ((-wstart + dW - 1) / dW);
+    if(hend > iH)
+        hend -= dH * ((hend - iH + dH - 1) / dH);
+    if(wend > iW)
+        wend -= dW * ((wend - iW + dW - 1) / dW);
+
+
+    // from here on coords[2] / coords[3] are reused as loop counters over input rows / columns;
+    // coords[0] / coords[1] (batch, channel) stay as decoded from yInd
+    switch (poolingMode) {
+
+        /*** max ***/
+        case 0: {
+            coord2 = hstart;
+            coord3 = wstart;
+
+            // find the position of the (first) maximum inside the window
+            T max = -DataTypeUtils::max<T>();
+            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
+                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW){
+                    T val = x[shape::getOffset(xShapeInfo, coords)];
+                    if (val > max) {
+                        max = val;
+                        coord2 = coords[2];
+                        coord3 = coords[3];
+                    }
+                }
+            }
+            // the whole gradient goes to the maximum's position
+            coords[2] = coord2;
+            coords[3] = coord3;
+            auto zOffset = shape::getOffset(zShapeInfo, coords);
+            // atomic because windows of different threads may select the same input element
+            sd::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], y[yOffset]);
+            //z[zOffset] += y[yOffset];
+        }
+        break;
+
+        /*** avg ***/
+        case 1: {
+
+            T val = y[yOffset];
+
+            // for any other extraParam0 the gradient is distributed without scaling
+            if (extraParam0 == 0)         //Exclude padding
+                val /= sd::math::nd4j_ceil<double,T>(static_cast<double>(hend - hstart) / static_cast<double>(dH)) * sd::math::nd4j_ceil<double,T>(static_cast<double>(wend - wstart) / static_cast<double>(dW));   //Accounts for dilation
+            else if (extraParam0 == 1)    //Include padding
+                val /= kProd;
+
+            // every element of the window receives the same share
+            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
+                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
+                    sd::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(zShapeInfo, coords)], val);
+        }
+        break;
+
+        /*** pnorm ***/
+        case 2: {
+
+            T sum = static_cast<T>(0.);
+            T val = y[yOffset];
+
+            // first pass: sum of |x|^p over the window
+            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH)
+                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW)
+                    sum += sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[shape::getOffset(xShapeInfo, coords)]), extraParam0);
+
+            // common factor gradO * sum^((1 - p) / p)
+            val *= sd::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
+
+            // second pass: each element additionally gets |x|^(p - 1) * sgn(x)
+            for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) {
+                for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) {
+                    const auto xOffset = shape::getOffset(xShapeInfo, coords);
+                    const auto zOffset = shape::getOffset(zShapeInfo, coords);
+                    sd::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * sd::math::nd4j_sgn<T,T>(x[xOffset]));
+                }
+            }
+        }
+        break;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for pooling2dBPCuda<T>.
+// Enqueues the kernel on *stream with the caller-supplied grid size, block size and dynamic shared-memory size
+// (in bytes); every other argument is forwarded to the kernel unchanged.
+template <typename T>
+static void pooling2dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                    const void* vx, const Nd4jLong* xShapeInfo,
+                                    const void* vy, const Nd4jLong* yShapeInfo,
+                                          void* vz, const Nd4jLong* zShapeInfo,
+                                    const int kH, const int kW,
+                                    const int sH, const int sW,
+                                    const int pH, const int pW,
+                                    const int dH, const int dW,
+                                    const int poolingMode, const int extraParam0) {
+
+    pooling2dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Backward pass of 2D pooling on the device: fills gradI from input and gradO.
+// Dispatches on input.dataType() (FLOAT_TYPES only) to pooling2dBPCudaLauncher and synchronizes before returning.
+void ConvolutionUtils::pooling2dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kH, const int kW, const int sH, const int sW, const int pH, const int pW, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+
+    // the kernel only adds into gradI, so it has to start from zero
+    gradI.nullify();
+
+    PointersManager manager(block.launchContext(), "pooling2dBP");
+
+    // one thread per gradO element; each thread keeps rank Nd4jLong coordinates in dynamic shared memory
+    const int numThreads     = 256;
+    const int numBlocks      = (gradO.lengthOf() + numThreads - 1) / numThreads;
+    const int sharedMemBytes = gradO.rankOf() * sizeof(Nd4jLong) * numThreads + 128;
+
+    NDArray::prepareSpecialUse({&gradI}, {&input, &gradO});
+    BUILD_SINGLE_SELECTOR(input.dataType(), pooling2dBPCudaLauncher,
+                          (numBlocks, numThreads, sharedMemBytes, block.launchContext()->getCudaStream(),
+                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
+                           gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(),
+                           gradI.specialBuffer(), gradI.specialShapeInfo(),
+                           kH, kW, sH, sW, pH, pW, dH, dW, poolingMode, extraParam0),
+                          FLOAT_TYPES);
+    NDArray::registerSpecialUse({&gradI}, {&input, &gradO});
+
+    manager.synchronize();
+}
+
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling3d.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling3d.cu
new file mode 100644
index 000000000..93e372a7e
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling3d.cu
@@ -0,0 +1,181 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+#include <math/templatemath.h>
+
+namespace sd {
+namespace ops  {
+
+
+//////////////////////////////////////////////////////////////////////////
+// Forward 3D pooling (max / avg / pnorm) over the three trailing dimensions.
+//   vx, xShapeInfo - input,  [bS, iC, iD, iH, iW]
+//   vz, zShapeInfo - output, [bS, iC, oD, oH, oW]
+//   kD,kH,kW / sD,sH,sW / pD,pH,pW / dD,dH,dW - kernel size, stride, padding and dilation per spatial dimension
+//   poolingMode - 0: max, 1: avg, 2: pnorm; any other value leaves z untouched
+//   extraParam0 - avg: 0 excludes padding from the divisor, 1 divides by kD*kH*kW; pnorm: the exponent p
+// One thread handles one output element and needs 5 Nd4jLong coordinates of dynamic shared memory.
+template <typename T>
+__global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+
+    // x input  is [bS, iC, iD, iH, iW]
+    // z output is [bS, iC, oD, oH, oW]
+
+    const T* x = reinterpret_cast<const T*>(vx);
+          T* z = reinterpret_cast<T*>(vz);
+
+    __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
+    __shared__ Nd4jLong zLen, *sharedMem;
+
+    // thread 0 publishes the block-wide constants and the base of the dynamic shared memory
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        rank = 5;
+        zLen = shape::length(zShapeInfo);
+
+        iD = xShapeInfo[3];
+        iH = xShapeInfo[4];
+        iW = xShapeInfo[5];
+
+        kProd = kD * kH * kW;
+
+        // effective kernel extents once dilation is taken into account
+        kDeff = kD + (kD - 1) * (dD - 1);
+        kHeff = kH + (kH - 1) * (dH - 1);
+        kWeff = kW + (kW - 1) * (dW - 1);
+    }
+    __syncthreads();
+
+    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // surplus threads of the last block have nothing to do
+    if (zInd >= zLen)
+        return;
+
+    // this thread's private coordinate slots
+    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+
+    shape::index2coords(zInd, zShapeInfo, coords);
+
+    const auto zOffset = shape::getOffset(zShapeInfo, coords);
+
+    // pooling window in input coordinates
+    int dBeg = coords[2] * sD - pD;
+    int hBeg = coords[3] * sH - pH;
+    int wBeg = coords[4] * sW - pW;
+    int dEnd = dBeg + kDeff;
+    int hEnd = hBeg + kHeff;
+    int wEnd = wBeg + kWeff;
+
+    // clip the lower bounds, moving by whole multiples of the dilation
+    if (dBeg < 0)
+        dBeg += dD * ((-dBeg + dD - 1) / dD);
+    if (hBeg < 0)
+        hBeg += dH * ((-hBeg + dH - 1) / dH);
+    if (wBeg < 0)
+        wBeg += dW * ((-wBeg + dW - 1) / dW);
+
+    // clip the upper bounds the same way
+    if (dEnd > iD)
+        dEnd -= dD * ((dEnd - iD + dD - 1) / dD);
+    if (hEnd > iH)
+        hEnd -= dH * ((hEnd - iH + dH - 1) / dH);
+    if (wEnd > iW)
+        wEnd -= dW * ((wEnd - iW + dW - 1) / dW);
+
+    // the spatial coordinate slots double as loop counters from here on
+    Nd4jLong& d = coords[2];
+    Nd4jLong& h = coords[3];
+    Nd4jLong& w = coords[4];
+
+    if (poolingMode == 0) {
+        /*** max ***/
+        T best = -DataTypeUtils::max<T>();
+
+        for (d = dBeg; d < dEnd; d += dD) {
+            for (h = hBeg; h < hEnd; h += dH) {
+                for (w = wBeg; w < wEnd; w += dW) {
+                    const T candidate = x[shape::getOffset(xShapeInfo, coords)];
+                    if (candidate > best)
+                        best = candidate;
+                }
+            }
+        }
+
+        z[zOffset] = best;
+    }
+    else if (poolingMode == 1) {
+        /*** avg ***/
+        T total = static_cast<T>(0.);
+
+        for (d = dBeg; d < dEnd; d += dD)
+            for (h = hBeg; h < hEnd; h += dH)
+                for (w = wBeg; w < wEnd; w += dW)
+                    total += x[shape::getOffset(xShapeInfo, coords)];
+
+        if (extraParam0 == 0) {
+            // exclude padding: divide by the number of lattice points per dimension (accounts for dilation)
+            uint nD = (dEnd - dBeg) / dD + ((dEnd - dBeg) % dD == 0 ? 0 : 1);
+            uint nH = (hEnd - hBeg) / dH + ((hEnd - hBeg) % dH == 0 ? 0 : 1);
+            uint nW = (wEnd - wBeg) / dW + ((wEnd - wBeg) % dW == 0 ? 0 : 1);
+            total /= static_cast<T>(nD * nH * nW);
+        }
+        else if (extraParam0 == 1) {
+            // include padding: divide by the full kernel volume
+            total /= kProd;
+        }
+
+        z[zOffset] = total;
+    }
+    else if (poolingMode == 2) {
+        /*** pnorm ***/
+        T total = static_cast<T>(0.);
+
+        for (d = dBeg; d < dEnd; d += dD)
+            for (h = hBeg; h < hEnd; h += dH)
+                for (w = wBeg; w < wEnd; w += dW)
+                    total += sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[shape::getOffset(xShapeInfo, coords)]), extraParam0);
+
+        total = sd::math::nd4j_pow<T,T,T>(total, (T) 1.f / extraParam0);
+
+        z[zOffset] = total;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Host-side launcher for pooling3dCuda<T>.
+// Enqueues the kernel on *stream with the caller-supplied grid size, block size and dynamic shared-memory size
+// (in bytes); every other argument is forwarded to the kernel unchanged.
+template <typename T>
+static void pooling3dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                const void* vx, const Nd4jLong* xShapeInfo,
+                                      void* vz, const Nd4jLong* zShapeInfo,
+                                const int kD, const int kH, const int kW,
+                                const int sD, const int sH, const int sW,
+                                const int pD, const int pH, const int pW,
+                                const int dD, const int dH, const int dW,
+                                const int poolingMode, const int extraParam0) {
+
+    pooling3dCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vz, zShapeInfo, kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Forward 3D pooling on the device: fills output from input.
+// Dispatches on input.dataType() (FLOAT_TYPES only) to pooling3dCudaLauncher and synchronizes before returning.
+void ConvolutionUtils::pooling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+
+    PointersManager manager(block.launchContext(), "pooling3d");
+
+    // one thread per output element; each thread keeps rank Nd4jLong coordinates in dynamic shared memory
+    const int numThreads     = MAX_NUM_THREADS / 2;
+    const int numBlocks      = (output.lengthOf() + numThreads - 1) / numThreads;
+    const int sharedMemBytes = output.rankOf() * sizeof(Nd4jLong) * numThreads  + 128;
+
+    NDArray::prepareSpecialUse({&output}, {&input});
+    BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dCudaLauncher,
+                          (numBlocks, numThreads, sharedMemBytes, block.launchContext()->getCudaStream(),
+                           input.getSpecialBuffer(), input.getSpecialShapeInfo(),
+                           output.specialBuffer(), output.specialShapeInfo(),
+                           kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0),
+                          FLOAT_TYPES);
+    NDArray::registerSpecialUse({&output}, {&input});
+
+    manager.synchronize();
+}
+
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling3dBP.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling3dBP.cu
new file mode 100644
index 000000000..51b48bc23
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_pooling3dBP.cu
@@ -0,0 +1,202 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+#include <math/templatemath.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+__global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+    // Backward pass of 3d pooling: every thread takes one gradO element and accumulates its contribution into gradI with atomic adds.
+    // x: input [bS, iC, iD, iH, iW]
+    // y: gradO [bS, iC, oD, oH, oW]
+    // z: gradI [bS, iC, iD, iH, iW] -> gradI is output in this function; it is only accumulated into, so it has to be zeroed before the launch
+    // poolingMode: 0 - max, 1 - avg, 2 - pnorm; extraParam0: avg -> 0 excludes / 1 includes padding in the divisor, pnorm -> the power p
+    // dynamic shared memory: rank (= 5) Nd4jLong slots per thread of the block, used as per-thread coordinate scratch
+    const T* x = reinterpret_cast<const T*>(vx);
+    const T* y = reinterpret_cast<const T*>(vy);
+          T* z = reinterpret_cast<T*>(vz);
+
+    // block-wide constants, filled in by thread 0 and published to the other threads by the barrier below
+    __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
+    __shared__ Nd4jLong yLen, *sharedMem;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        yLen = shape::length(yShapeInfo);
+        rank = 5;
+
+        kDeff = kD + (kD - 1) * (dD - 1);   // kernel extent with dilation taken into account
+        kHeff = kH + (kH - 1) * (dH - 1);
+        kWeff = kW + (kW - 1) * (dW - 1);
+
+        iD = xShapeInfo[3];
+        iH = xShapeInfo[4];
+        iW = xShapeInfo[5];
+
+        kProd = kD * kH * kW;
+    }
+    __syncthreads();
+
+    const auto yInd = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if(yInd >= yLen)
+        return;
+
+    auto coords = sharedMem + threadIdx.x * rank;
+
+    shape::index2coords(yInd, yShapeInfo, coords);
+
+    const auto yOffset = shape::getOffset(yShapeInfo, coords);
+
+    int dstart = coords[2] * sD - pD;
+    int hstart = coords[3] * sH - pH;
+    int wstart = coords[4] * sW - pW;
+    int dend = dstart + kDeff;
+    int hend = hstart + kHeff;
+    int wend = wstart + kWeff;
+    // clip the window to [0, iD) x [0, iH) x [0, iW), moving the bounds in whole dilation steps
+    if(dstart < 0)
+        dstart += dD * ((-dstart + dD - 1) / dD);
+    if(hstart < 0)
+        hstart += dH * ((-hstart + dH - 1) / dH);
+    if(wstart < 0)
+        wstart += dW * ((-wstart + dW - 1) / dW);
+    if(dend > iD)
+        dend -= dD * ((dend - iD + dD - 1) / dD);
+    if(hend > iH)
+        hend -= dH * ((hend - iH + dH - 1) / dH);
+    if(wend > iW)
+        wend -= dW * ((wend - iW + dW - 1) / dW);
+
+
+    switch (poolingMode) {
+
+        /*** max ***/
+        case 0: {
+            Nd4jLong coord2 = dstart, coord3 = hstart, coord4 = wstart;   // arg-max defaults to the first window element, so it stays defined when no value compares greater (e.g. all NaN)
+            T max = -DataTypeUtils::max<T>();
+            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
+                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH){
+                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
+                        T val = x[shape::getOffset(xShapeInfo, coords)];
+                        if (val > max) {
+                            max = val;
+                            coord2 = coords[2];
+                            coord3 = coords[3];
+                            coord4 = coords[4];
+                        }
+                    }
+                }
+            }
+            if (dstart < dend && hstart < hend && wstart < wend) {   // an empty (fully clipped) window has no element to receive the gradient
+                coords[2] = coord2; coords[3] = coord3; coords[4] = coord4;
+                sd::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(zShapeInfo, coords)], y[yOffset]);
+            }
+        }
+        break;
+
+        /*** avg ***/
+        case 1: {
+
+            T val = y[yOffset];
+
+            if (extraParam0 == 0)         //Exclude padding
+                val /= sd::math::nd4j_ceil<double,T>(static_cast<double>(dend - dstart) / static_cast<double>(dD))  * sd::math::nd4j_ceil<double,T>(static_cast<double>(hend - hstart) / static_cast<double>(dH))     * sd::math::nd4j_ceil<double,T>(static_cast<double>(wend - wstart)    / static_cast<double>(dW));   //Accounts for dilation
+            else if (extraParam0 == 1)    //Include padding
+                val /= kProd;
+
+            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
+                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
+                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
+                        sd::math::atomics::nd4j_atomicAdd<T>(&z[shape::getOffset(zShapeInfo, coords)], val);
+        }
+        break;
+
+        /*** pnorm ***/
+        case 2: {
+
+            T sum = static_cast<T>(0.);
+            T val = y[yOffset];
+
+            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD)
+                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH)
+                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW)
+                        sum += sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[shape::getOffset(xShapeInfo, coords)]), extraParam0);
+
+            val *= sd::math::nd4j_pow<T,T,T>(sum, ((T)1.f - extraParam0) / extraParam0);
+
+            for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) {
+                for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) {
+                    for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) {
+                        const auto xOffset = shape::getOffset(xShapeInfo, coords);
+                        const auto zOffset = shape::getOffset(zShapeInfo, coords);
+                        sd::math::atomics::nd4j_atomicAdd<T>(&z[zOffset], val * sd::math::nd4j_pow<T,T,T>(sd::math::nd4j_abs<T>(x[xOffset]), extraParam0 - 1.f) * sd::math::nd4j_sgn<T,T>(x[xOffset]));
+                    }
+                }
+            }
+        }
+        break;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void pooling3dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                    const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo,
+                                    const int kD, const int kH, const int kW, const int sD, const int sH, const int sW,
+                                    const int pD, const int pH, const int pW, const int dD, const int dH, const int dW,
+                                    const int poolingMode, const int extraParam0) {
+    // Typed trampoline: instantiates pooling3dBPCuda for T and enqueues it on *stream with the launch
+    // configuration chosen by the caller (grid size, block size, dynamic shared memory in bytes).
+    pooling3dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
+        vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo,
+        kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW,
+        poolingMode, extraParam0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::pooling3dBP(sd::graph::Context& block, const NDArray& input, const NDArray& gradO, NDArray& gradI, const int kD, const int kH, const int kW, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW, const int poolingMode, const int extraParam0) {
+    // Host-side driver for pooling3dBPCuda: clears gradI, derives the launch configuration from gradO, runs the typed launcher and synchronizes.
+    gradI.nullify();   // pooling3dBPCuda only adds into gradI, so it has to start from zeros
+
+    auto launchContext = block.launchContext();
+    PointersManager manager(launchContext, "pooling3dBP");
+
+    // the grid is rounded up so that it supplies at least gradO.lengthOf() threads
+    const int blockSize = MAX_NUM_THREADS / 2;
+    const int gridSize = (gradO.lengthOf() + blockSize - 1) / blockSize;
+    const int shmemBytes = gradO.rankOf() * sizeof(Nd4jLong) * blockSize + 128;   // rank Nd4jLong slots per thread, plus 128 spare bytes
+
+    NDArray::prepareSpecialUse({&gradI}, {&input, &gradO});
+    BUILD_SINGLE_SELECTOR(input.dataType(), pooling3dBPCudaLauncher, (gridSize, blockSize, shmemBytes, launchContext->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(), gradI.specialBuffer(), gradI.specialShapeInfo(), kD, kH, kW, sD, sH, sW, pD, pH, pW, dD, dH, dW, poolingMode, extraParam0), FLOAT_TYPES);
+    NDArray::registerSpecialUse({&gradI}, {&input, &gradO});
+    manager.synchronize();
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_sconv2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_sconv2d.cu
new file mode 100644
index 000000000..3a9ed5364
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_sconv2d.cu
@@ -0,0 +1,73 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void sconv2d_(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {
+    // Separable 2d convolution: a depthwise convolution, followed by a 1x1 pointwise convolution when weightsPoint is given.
+    // input         [bS, iH, iW, iC]  (NHWC) or [bS, iC, iH, iW]  (NCHW)
+    // weightsDepth  [kH, kW, iC, mC], [mC, iC, kH, kW], [mC, kH, kW, iC]
+    // weightsPoint  [1, 1, iC*mC, oC], [oC, iC*mC, 1, 1], [oC, 1, 1, iC*mC]
+    // bias          [oC], oC = iC*mC if weightsPoint=nullptr
+    // output is     [bS, oH, oW, oC]  (NHWC) or [bS, oC, oH, oW]  (NCHW)
+
+    //  kH         filter(kernel) height
+    //  kW         filter(kernel) width
+    //  sH         strides height
+    //  sW         strides width
+    //  pH         paddings height
+    //  pW         paddings width
+    //  dH         dilations height
+    //  dW         dilations width
+    //  paddingMode 0-VALID, 1-SAME
+    //  isNCHW     1-NCHW,  0-NHWC
+
+    int bS, iC, iH, iW, mC, oC, oH, oW;                     // batch size, input channels, input height/width, channels multiplier, output channels, output height/width
+    int indIOioC, indIiH, indWmC, indWiC, indWkH, indOoH;   // corresponding indexes
+    ConvolutionUtils::getSizesAndIndexesConv2d(isNCHW, wFormat, *input, *output, bS, iC, iH, iW, oC, oH, oW, indIOioC, indIiH, indWiC, indWmC, indWkH, indOoH);
+    mC = weightsDepth->sizeAt(indWmC);                      // channels multiplier
+    // the depthwise result goes straight into output unless a pointwise step follows; then a temporary with iC*mC channels is heap-allocated
+    NDArray* outputDepth = output;
+    if(weightsPoint)                        // if pointwise convolution is expected
+        outputDepth = new NDArray(output->ordering(), !isNCHW ? std::vector<Nd4jLong>({bS, oH, oW, iC*mC}) : std::vector<Nd4jLong>({bS, iC*mC, oH, oW}), input->dataType(), input->getContext());
+    try {   // ----- perform depthwise convolution (if weightsPoint is absent then oC = iC*mC) ----- //
+        ConvolutionUtils::depthwiseConv2d(block, input, weightsDepth, weightsPoint ? nullptr : bias, outputDepth, kH,kW, sH,sW, pH,pW, dH,dW, paddingMode, isNCHW, wFormat);
+        if (weightsPoint)   // ----- perform pointwise convolution (oH = iH, oW = iW) ----- //
+            ConvolutionUtils::conv2d(block, outputDepth, weightsPoint, bias, output, 1,1, 1,1, 0,0, 1,1, paddingMode, isNCHW, wFormat);             // in this case oH=iH, oW=iW
+    } catch (...) {
+        if (weightsPoint) delete outputDepth;   // do not leak the temporary when one of the helpers throws
+        throw;
+    }
+    if (weightsPoint) delete outputDepth;       // the temporary is owned here only when it was allocated above
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::sconv2d(sd::graph::Context& block, const NDArray* input, const NDArray* weightsDepth, const NDArray* weightsPoint, const NDArray* bias,  NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int paddingMode, const int isNCHW, const int wFormat) {   // public entry point: hands every argument unchanged to sconv2d_ via BUILD_SINGLE_SELECTOR_TWICE, keyed on input->dataType() over FLOAT_TYPES
+    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), sconv2d_, (block, input, weightsDepth, weightsPoint, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, paddingMode, isNCHW, wFormat), FLOAT_TYPES);
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling2d.cu
new file mode 100644
index 000000000..be9fab0be
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling2d.cu
@@ -0,0 +1,97 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+__global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int factorH, const int factorW, const bool isNCHW) {
+    // Upsampling by repetition: each output element copies the input element found by integer-dividing its two spatial coordinates by factorH / factorW.
+    // x has shape [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
+    // z has shape [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
+    // dynamic shared memory: 4 Nd4jLong slots per thread of the block, used as per-thread coordinate scratch
+    const T* in  = static_cast<const T*>(vx);
+          T* out = static_cast<T*>(vz);
+
+    __shared__ Nd4jLong outLen, *coordPool;
+    __shared__ int numDims, hAxis;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        coordPool = reinterpret_cast<Nd4jLong*>(shmem);
+
+        numDims = 4;
+        outLen  = shape::length(zShapeInfo);
+        hAxis   = isNCHW ? 2 : 1;
+    }
+    __syncthreads();
+
+    const auto outIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // threads whose index falls beyond the output length have nothing to do
+    if (outIdx >= outLen) return;
+
+    auto pos = coordPool + threadIdx.x * numDims;
+
+    shape::index2coords(outIdx, zShapeInfo, pos);
+
+    const auto outOffset = shape::getOffset(zShapeInfo, pos);
+
+    // turn the output position into the position of the source element
+    pos[hAxis]     = pos[hAxis]     / factorH;
+    pos[hAxis + 1] = pos[hAxis + 1] / factorW;
+
+    const auto inOffset = shape::getOffset(xShapeInfo, pos);
+    out[outOffset] = in[inOffset];
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void upsampling2dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                     const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int factorH, const int factorW, const bool isNCHW) {
+    // Typed trampoline: instantiates upsampling2dCuda for T and enqueues it on *stream with the caller's grid size, block size and dynamic shared memory size.
+    upsampling2dCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
+        vx, xShapeInfo, vz, zShapeInfo,
+        factorH, factorW, isNCHW);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::upsampling2d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorH, const int factorW, const bool isNCHW) {
+    // Host-side driver for upsampling2dCuda: derives the launch configuration from output, runs the typed launcher and synchronizes.
+    auto launchContext = block.launchContext();
+    PointersManager manager(launchContext, "upsampling2d");
+
+    // the grid is rounded up so that it supplies at least output.lengthOf() threads
+    const int blockSize = MAX_NUM_THREADS / 2;
+    const int gridSize = (output.lengthOf() + blockSize - 1) / blockSize;
+    const int shmemBytes = output.rankOf() * sizeof(Nd4jLong) * blockSize + 128;   // rank Nd4jLong slots per thread, plus 128 spare bytes
+    NDArray::prepareSpecialUse({&output}, {&input});
+    BUILD_SINGLE_SELECTOR(input.dataType(), upsampling2dCudaLauncher, (gridSize, blockSize, shmemBytes, launchContext->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), factorH, factorW, isNCHW), FLOAT_TYPES);
+    NDArray::registerSpecialUse({&output}, {&input});
+    manager.synchronize();
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling2dBP.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling2dBP.cu
new file mode 100644
index 000000000..ce393d279
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling2dBP.cu
@@ -0,0 +1,103 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+__global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool isNCHW) {
+    // Backward pass of 2d upsampling: every thread owns one gradI element and stores into it the sum of the matching factorH x factorW block of gradO.
+    // x (gradO) has shape [bS, iC, factorH*iH, factorW*iW ] (NCHW) or [bS, factorH*iH, factorW*iW, iC] (NHWC)
+    // z (gradI) has shape [bS, iC, iH, iW] (NCHW) or [bS, iH, iW, iC] (NHWC)
+    // dynamic shared memory: 4 Nd4jLong slots per thread of the block, used as per-thread coordinate scratch
+    const T* x = reinterpret_cast<const T*>(vx);
+          T* z = reinterpret_cast<T*>(vz);
+
+    __shared__ int rank, dimIH;
+    __shared__ uint factorH, factorW;
+    __shared__ Nd4jLong zLen, *sharedMem;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        dimIH = isNCHW ? 2 : 1;
+        zLen  = shape::length(zShapeInfo);
+        rank  = 4;
+        // the upsampling factors are recovered as the ratio of the gradO to the gradI spatial sizes
+        factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1];
+        factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2];
+    }
+    __syncthreads();
+
+    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if(zInd >= zLen)
+        return;
+
+    auto coords = sharedMem + threadIdx.x * rank;
+
+    shape::index2coords(zInd, zShapeInfo, coords);
+
+    const auto zOffset = shape::getOffset(zShapeInfo, coords);
+
+    T sum = static_cast<T>(0.);   // accumulated locally and stored once, instead of a global read-modify-write of z on every iteration
+    const Nd4jLong zCoord2 = coords[dimIH]     * factorH;
+    const Nd4jLong zCoord3 = coords[dimIH + 1] * factorW;
+
+    for(coords[dimIH] = zCoord2; coords[dimIH] < zCoord2 + factorH; ++coords[dimIH])
+        for(coords[dimIH + 1] = zCoord3; coords[dimIH + 1] < zCoord3 + factorW; ++coords[dimIH + 1])
+            sum += x[shape::getOffset(xShapeInfo, coords)];
+    z[zOffset] = sum;
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void upsampling2dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                       const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool isNCHW) {
+    // Typed trampoline: instantiates upsampling2dBPCuda for T and enqueues it on *stream with the caller's grid size, block size and dynamic shared memory size.
+    upsampling2dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
+        vx, xShapeInfo, vz, zShapeInfo,
+        isNCHW);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::upsampling2dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCHW) {
+    // Host-side driver for upsampling2dBPCuda: derives the launch configuration from gradI, runs the typed launcher and synchronizes.
+    auto launchContext = block.launchContext();
+    PointersManager manager(launchContext, "upsampling2d_bp");
+
+    // the grid is rounded up so that it supplies at least gradI.lengthOf() threads
+    const int blockSize = MAX_NUM_THREADS / 2;
+    const int gridSize = (gradI.lengthOf() + blockSize - 1) / blockSize;
+    const int shmemBytes = gradI.rankOf() * sizeof(Nd4jLong) * blockSize + 128;   // rank Nd4jLong slots per thread, plus 128 spare bytes
+    NDArray::prepareSpecialUse({&gradI}, {&gradO});
+    BUILD_SINGLE_SELECTOR(gradI.dataType(), upsampling2dBPCudaLauncher, (gridSize, blockSize, shmemBytes, launchContext->getCudaStream(), gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(), gradI.specialBuffer(), gradI.specialShapeInfo(), isNCHW), FLOAT_TYPES);
+    NDArray::registerSpecialUse({&gradI}, {&gradO});
+    manager.synchronize();
+}
+
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling3d.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling3d.cu
new file mode 100644
index 000000000..6f15a27d6
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling3d.cu
@@ -0,0 +1,98 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+__global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
+    // Upsampling by repetition: each output element copies the input element found by integer-dividing its three spatial coordinates by factorD / factorH / factorW.
+    // x has shape [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
+    // z has shape [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
+    // dynamic shared memory: 5 Nd4jLong slots per thread of the block, used as per-thread coordinate scratch
+    const T* in  = static_cast<const T*>(vx);
+          T* out = static_cast<T*>(vz);
+
+    __shared__ Nd4jLong outLen, *coordPool;
+    __shared__ int numDims, dAxis;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        coordPool = reinterpret_cast<Nd4jLong*>(shmem);
+
+        numDims = 5;
+        outLen  = shape::length(zShapeInfo);
+        dAxis   = isNCDHW ? 2 : 1;
+    }
+    __syncthreads();
+
+    const auto outIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // threads whose index falls beyond the output length have nothing to do
+    if (outIdx >= outLen) return;
+
+    auto pos = coordPool + threadIdx.x * numDims;
+
+    shape::index2coords(outIdx, zShapeInfo, pos);
+
+    const auto outOffset = shape::getOffset(zShapeInfo, pos);
+
+    // turn the output position into the position of the source element
+    pos[dAxis]     = pos[dAxis]     / factorD;
+    pos[dAxis + 1] = pos[dAxis + 1] / factorH;
+    pos[dAxis + 2] = pos[dAxis + 2] / factorW;
+
+    const auto inOffset = shape::getOffset(xShapeInfo, pos);
+    out[outOffset] = in[inOffset];
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void upsampling3dCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                     const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
+    // Typed trampoline: instantiates upsampling3dCuda for T and enqueues it on *stream with the caller's grid size, block size and dynamic shared memory size.
+    upsampling3dCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
+        vx, xShapeInfo, vz, zShapeInfo,
+        factorD, factorH, factorW, isNCDHW);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::upsampling3d(sd::graph::Context& block, const NDArray& input, NDArray& output, const int factorD, const int factorH, const int factorW, const bool isNCDHW) {
+    // Host-side driver for upsampling3dCuda: derives the launch configuration from output, runs the typed launcher and synchronizes.
+    auto launchContext = block.launchContext();
+    PointersManager manager(launchContext, "upsampling3d");
+
+    // the grid is rounded up so that it supplies at least output.lengthOf() threads
+    const int blockSize = MAX_NUM_THREADS / 2;
+    const int gridSize = (output.lengthOf() + blockSize - 1) / blockSize;
+    const int shmemBytes = output.rankOf() * sizeof(Nd4jLong) * blockSize + 128;   // rank Nd4jLong slots per thread, plus 128 spare bytes
+    NDArray::prepareSpecialUse({&output}, {&input});
+    BUILD_SINGLE_SELECTOR(input.dataType(), upsampling3dCudaLauncher, (gridSize, blockSize, shmemBytes, launchContext->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), factorD, factorH, factorW, isNCDHW), FLOAT_TYPES);
+    NDArray::registerSpecialUse({&output}, {&input});
+    manager.synchronize();
+}
+
+}
+}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling3dBP.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling3dBP.cu
new file mode 100644
index 000000000..f9eb56bec
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_upsampling3dBP.cu
@@ -0,0 +1,107 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+__global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool isNCDHW) {
+    // Backward pass of 3d upsampling: every thread owns one gradI element and stores into it the sum of the matching factorD x factorH x factorW block of gradO.
+    // x (gradO) has shape [bS, iC, factorD*iD, factorH*iH, factorW*iW ] (NCDHW) or [bS, factorD*iD, factorH*iH, factorW*iW, iC] (NDHWC)
+    // z (gradI) has shape [bS, iC, iD, iH, iW] (NCDHW) or [bS, iD, iH, iW, iC] (NDHWC)
+    // dynamic shared memory: 5 Nd4jLong slots per thread of the block, used as per-thread coordinate scratch
+    const T* x = reinterpret_cast<const T*>(vx);
+          T* z = reinterpret_cast<T*>(vz);
+
+    __shared__ int rank, dimID;
+    __shared__ uint factorD, factorH, factorW;
+    __shared__ Nd4jLong zLen, *sharedMem;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        dimID = isNCDHW ? 2 : 1;
+        zLen  = shape::length(zShapeInfo);
+        rank  = 5;
+        // the upsampling factors are recovered as the ratio of the gradO to the gradI spatial sizes
+        factorD = xShapeInfo[dimID + 1] / zShapeInfo[dimID + 1];
+        factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2];
+        factorW = xShapeInfo[dimID + 3] / zShapeInfo[dimID + 3];
+    }
+    __syncthreads();
+
+    const auto zInd = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if(zInd >= zLen)
+        return;
+
+    auto coords = sharedMem + threadIdx.x * rank;
+
+    shape::index2coords(zInd, zShapeInfo, coords);
+
+    const auto zOffset = shape::getOffset(zShapeInfo, coords);
+
+    T sum = static_cast<T>(0.);   // accumulated locally and stored once, instead of a global read-modify-write of z on every iteration
+    const Nd4jLong zCoord2 = coords[dimID]     * factorD;
+    const Nd4jLong zCoord3 = coords[dimID + 1] * factorH;
+    const Nd4jLong zCoord4 = coords[dimID + 2] * factorW;
+
+    for(coords[dimID] = zCoord2; coords[dimID] < zCoord2 + factorD; ++coords[dimID])
+        for(coords[dimID + 1] = zCoord3; coords[dimID + 1] < zCoord3 + factorH; ++coords[dimID + 1])
+            for(coords[dimID + 2] = zCoord4; coords[dimID + 2] < zCoord4 + factorW; ++coords[dimID + 2])
+                sum += x[shape::getOffset(xShapeInfo, coords)];
+    z[zOffset] = sum;
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void upsampling3dBPCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                       const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const bool isNCDHW) {
+    // Typed trampoline: instantiates upsampling3dBPCuda for T and enqueues it on *stream with the caller's grid size, block size and dynamic shared memory size.
+    upsampling3dBPCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(
+        vx, xShapeInfo, vz, zShapeInfo,
+        isNCDHW);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void ConvolutionUtils::upsampling3dBP(sd::graph::Context& block, const NDArray& gradO, NDArray& gradI, const bool isNCDHW) {
+    // Host-side driver for upsampling3dBPCuda: derives the launch configuration from gradI, runs the typed launcher and synchronizes.
+    auto launchContext = block.launchContext();
+    PointersManager manager(launchContext, "upsampling3d_bp");
+
+    // the grid is rounded up so that it supplies at least gradI.lengthOf() threads
+    const int blockSize = MAX_NUM_THREADS / 2;
+    const int gridSize = (gradI.lengthOf() + blockSize - 1) / blockSize;
+    const int shmemBytes = gradI.rankOf() * sizeof(Nd4jLong) * blockSize + 128;   // rank Nd4jLong slots per thread, plus 128 spare bytes
+    NDArray::prepareSpecialUse({&gradI}, {&gradO});
+    BUILD_SINGLE_SELECTOR(gradI.dataType(), upsampling3dBPCudaLauncher, (gridSize, blockSize, shmemBytes, launchContext->getCudaStream(), gradO.getSpecialBuffer(), gradO.getSpecialShapeInfo(), gradI.specialBuffer(), gradI.specialShapeInfo(), isNCDHW), FLOAT_TYPES);
+    NDArray::registerSpecialUse({&gradI}, {&gradO});
+    manager.synchronize();
+}
+
+
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions_vol2col.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_vol2col.cu
new file mode 100644
index 000000000..ebe0ec26e
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions_vol2col.cu
@@ -0,0 +1,111 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ * Copyright (c) 2019 Konduit K.K.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <ops/declarable/helpers/convolutions.h>
+#include <helpers/PointersManager.h>
+
+namespace sd {
+namespace ops  {
+
+//////////////////////////////////////////////////////////////////////////
+// vol [bS, iC, iD, iH, iW] is rearranged (vol2col) into col [bS, iC, kD, kH, kW, oD, oH, oW]
+// Launch contract: only the x dimension of grid and block is used; thread i handles the col element with linear index i, threads past the end exit.
+// Dynamic shared memory must provide 8 Nd4jLong slots per thread of the block: each thread keeps its col coordinates there.
+// Volume coordinate per spatial axis = -p + kernelIndex * d + outputIndex * s; col gets the vol value there, or 0 when it falls outside vol.
+template <typename T>
+static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeInfo, void* columns, const Nd4jLong* colShapeInfo,  const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+
+    const T* vol = reinterpret_cast<const T*>(volume);
+          T* col = reinterpret_cast<T*>(columns);
+
+    __shared__ int colRank;
+    __shared__ Nd4jLong colLen, iD, iH, iW, *sharedMem;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        colRank = 8;                                // col is [bS, iC, kD, kH, kW, oD, oH, oW]
+        colLen  = shape::length(colShapeInfo);
+
+        // vol is [bS, iC, iD, iH, iW]; assumes shapeInfo keeps the rank at index 0 followed by the dimensions
+        iD = volShapeInfo[3];
+        iH = volShapeInfo[4];
+        iW = volShapeInfo[5];
+    }
+    __syncthreads();
+
+    // widen before multiplying: the 32-bit product blockIdx.x * blockDim.x wraps for col arrays above 2^32 elements
+    const Nd4jLong colInd = static_cast<Nd4jLong>(blockIdx.x) * blockDim.x + threadIdx.x;
+
+    if(colInd >= colLen)
+        return;
+
+    // per-thread scratch: after index2coords, coords = {b, c, kDep, kRow, kCol, colD, colH, colW}
+    auto coords = sharedMem + threadIdx.x * colRank;
+
+    shape::index2coords(colInd, colShapeInfo, coords);
+    const auto colOffset = shape::getOffset(colShapeInfo, coords);
+
+    // reuse slots 2..4 for the volume position, so coords[0..4] becomes {b, c, volDep, volRow, volCol}
+    coords[2] = -pD + coords[2] * dD + coords[5] * sD;     // volDep = (-pD + kDep * dD) + colD * sD
+    coords[3] = -pH + coords[3] * dH + coords[6] * sH;     // volRow = (-pH + kRow * dH) + colH * sH
+    coords[4] = -pW + coords[4] * dW + coords[7] * sW;     // volCol = (-pW + kCol * dW) + colW * sW
+
+    // compare in 64 bits: casting Nd4jLong to 32-bit unsigned would drop the upper half of the coordinate
+    const bool outsideVol = coords[2] < 0 || coords[2] >= iD ||
+                            coords[3] < 0 || coords[3] >= iH ||
+                            coords[4] < 0 || coords[4] >= iW;
+
+    if (outsideVol)
+        col[colOffset] = static_cast<T>(0.);
+    else
+        col[colOffset] = vol[shape::getOffset(volShapeInfo, coords)];
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Host-side wrapper: enqueues vol2colCuda<T> on *stream with the given grid size, block size and dynamic shared-memory byte count; all other arguments are forwarded unchanged.
+template <typename T>
+static void vol2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                const void* volume, const Nd4jLong* volShapeInfo, void* columns, const Nd4jLong* colShapeInfo,
+                                const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+    const cudaStream_t launchStream = *stream;
+    vol2colCuda<T><<<blocksPerGrid, threadsPerBlock, sharedMem, launchStream>>>(volume, volShapeInfo, columns, colShapeInfo, sD, sH, sW, pD, pH, pW, dD, dH, dW);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Launches vol2colCuda with vol as the kernel input and col as the kernel output, dispatching on vol's data type (FLOAT_TYPES).
+void ConvolutionUtils::vol2col(sd::graph::Context& block, const NDArray& vol, NDArray& col, const int sD, const int sH, const int sW, const int pD, const int pH, const int pW, const int dD, const int dH, const int dW) {
+    auto context = block.launchContext();
+    PointersManager manager(context, "vol2col");
+    // grid covers col.lengthOf() threads; dynamic shared memory = rankOf() Nd4jLong values per thread plus 128 extra bytes
+    const int numThreads = MAX_NUM_THREADS / 4;
+    const int numBlocks  = (col.lengthOf() + numThreads - 1) / numThreads;
+    const int shmemBytes = col.rankOf() * sizeof(Nd4jLong) * numThreads + 128;
+
+    NDArray::prepareSpecialUse({&col}, {&vol});
+    BUILD_SINGLE_SELECTOR(vol.dataType(), vol2colCudaLauncher, (numBlocks, numThreads, shmemBytes, context->getCudaStream(), vol.getSpecialBuffer(), vol.getSpecialShapeInfo(), col.specialBuffer(), col.specialShapeInfo(), sD, sH, sW, pD, pH, pW, dD, dH, dW), FLOAT_TYPES);
+    NDArray::registerSpecialUse({&col}, {&vol});
+    manager.synchronize();
+}
+
+}
+}
\ No newline at end of file