[WIP] stb/bts nd (#144)
* start working on space_to_batch_nd
* provide cpu helper for space_to_batch_nd op
* few typos fixed
* add tests for space_to_batch and correct bugs
* write cuda kernel for space_to_batch op
* add order argument to shape::index2coords method in convolution cuda ops
* restore some previous code
* old col2im kernel activated
* change coords calculation in col2im kernel
* restore old col2im kernel
* add custom op for batch_to_space
* provide cpu version for batch_to_space_nd op
* provide cuda kernel for batch_to_space_nd op

Signed-off-by: Yurii <yurii@skymind.io>
Signed-off-by: raver119 <raver119@gmail.com>

parent e604ffe0d2
commit eea3062ccf
@@ -599,6 +599,7 @@ namespace nd4j {

        /**
        * apply reduce operation to array
        * extraParams - extra parameters for operation
        * returns scalar array
        */
        NDArray reduceNumber(nd4j::reduce::FloatOps ops, void *extraParams = nullptr) const;
        NDArray reduceNumber(nd4j::reduce::SameOps ops, void *extraParams = nullptr) const;
@@ -30,7 +30,6 @@ limitations under the License.

 ==============================================================================*/

//
// @author raver119@gmail.com, created on 19.01.18.
// @author Yurii Shyrma (iuriish@yahoo.com)
//
@@ -63,9 +62,8 @@ CUSTOM_OP_IMPL(batch_to_space, 2, 1, false, 0, 1) {

     REQUIRE_TRUE(rank == 4, 0, "BatchToSpace: rank of input array must be equal 4, but got %i instead", rank);
     REQUIRE_TRUE(dim0 % (blockSize * blockSize) == 0, 0, "BatchToSpace: first dimension of input array must be divisible by blockSize * blockSize (that is by %i), but got first dimension equal to %i", blockSize * blockSize, dim0);

-    const std::string expectedCropShape = "[2, 2]";
-    const std::string actualCropShape = ShapeUtils::shapeAsString(crop);
-    REQUIRE_TRUE(actualCropShape == expectedCropShape, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", actualCropShape.c_str());
+    if(crop->sizeAt(0) != 2 || crop->sizeAt(1) != 2)
+        REQUIRE_TRUE(false, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(crop).c_str());

     const uint cropBottom = crop->e<uint>(0,0);
     const uint cropTop    = crop->e<uint>(0,1);
@@ -104,9 +102,8 @@ DECLARE_SHAPE_FN(batch_to_space) {

     REQUIRE_TRUE(rank == 4, 0, "BatchToSpace: rank of input array must be equal 4, but got %i instead", rank);
     REQUIRE_TRUE(dim0 % (blockSize * blockSize) == 0, 0, "BatchToSpace: first dimension of input array must be divisible by blockSize * blockSize (that is by %i), but got first dimension equal to %i", blockSize * blockSize, dim0);

-    const std::string expectedCropShape = "[2, 2]";
-    const std::string actualCropShape = ShapeUtils::shapeAsString(cropShapeInfo);
-    REQUIRE_TRUE(actualCropShape == expectedCropShape, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", actualCropShape.c_str());
+    if(cropShapeInfo[1] != 2 || cropShapeInfo[2] != 2)
+        REQUIRE_TRUE(false, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(cropShapeInfo).c_str());

     const uint cropBottom = INPUT_VARIABLE(1)->e<Nd4jLong>(0,0);
     const uint cropTop    = INPUT_VARIABLE(1)->e<Nd4jLong>(0,1);
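Note (illustration, not part of this commit): a minimal batch_to_space invocation consistent with the checks above, assuming the same execute() overload used by the tests at the bottom of this page. The crop array is the [2, 2] second input; block_size is the single integer argument.

    NDArray x('c', {4, 1, 1, 1}, nd4j::DataType::FLOAT32);            // bS = 4 = 1 * 2 * 2
    NDArray crop('c', {2, 2}, {0, 0, 0, 0}, nd4j::DataType::INT32);   // no cropping
    x.linspace(1);

    nd4j::ops::batch_to_space op;
    auto result = op.execute({&x, &crop}, {}, {2});                   // block_size = 2 -> output shape [1, 2, 2, 1]
    delete result;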
@@ -0,0 +1,131 @@
/*******************************************************************************
 * Copyright (c) 2015-2018 Skymind, Inc.
 *
 * This program and the accompanying materials are made available under the
 * terms of the Apache License, Version 2.0 which is available at
 * https://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 ******************************************************************************/

/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

//
// @author Yurii Shyrma (iuriish@yahoo.com)
//

#include <op_boilerplate.h>
#if NOT_EXCLUDED(OP_batch_to_space_nd)

#include <ops/declarable/headers/parity_ops.h>
#include <ops/declarable/helpers/s_t_b.h>

namespace nd4j {
namespace ops {

CUSTOM_OP_IMPL(batch_to_space_nd, 3, 1, false, 0, 0) {

    // 4D example, numOfSpatialDims = 2 - two spatial dimensions
    // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC]

    auto input      = INPUT_VARIABLE(0);
    auto blockShape = INPUT_VARIABLE(1);
    auto crop       = INPUT_VARIABLE(2);

    auto output = OUTPUT_VARIABLE(0);

    REQUIRE_TRUE(blockShape->rankOf() == 1, 0, "BatchToSpaceND: rank of blockShape array must be equal to one, but got %i instead !", blockShape->rankOf());

    const uint numOfSpatialDims = blockShape->sizeAt(0);

    const auto product = blockShape->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
    REQUIRE_TRUE(input->sizeAt(0) % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, input->sizeAt(0));

    // FIXME - should we use this time-consuming validation ?
    for (uint i = 0; i < numOfSpatialDims; ++i) {
        const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
        REQUIRE_TRUE(blockSize >= 2, 0, "BatchToSpaceND: all elements of blockShape array must be >= 2, but got value of %i for element number %i !", blockSize, i);
    }

    if(crop->sizeAt(0) != numOfSpatialDims || crop->sizeAt(1) != 2) {
        const std::string expectedCropShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
        REQUIRE_TRUE(false, 0, "BatchToSpaceND: operation expects crop shape to be %s, but got %s instead", expectedCropShape.c_str(), ShapeUtils::shapeAsString(crop).c_str());
    }

    // FIXME - should we use this time-consuming validation ?
    for (uint i = 0; i < numOfSpatialDims; ++i) {
        const auto cropLeft      = crop->e<uint>(i,0);
        const auto cropRight     = crop->e<uint>(i,1);
        const auto outSpatialDim = input->sizeAt(i + 1) * blockShape->e<Nd4jLong>(i) - cropLeft - cropRight;
        REQUIRE_TRUE(outSpatialDim >= 0, 0, "BatchToSpaceND: crop left/right values are too big and cause negative output spatial dimension/dimensions !");
    }

    helpers::batchToSpaceND(block.launchContext(), *input, *blockShape, *crop, *output);

    return Status::OK();
}

////////////////////////////////////////////////////////////////////////////////
DECLARE_TYPES(batch_to_space_nd) {

    getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY)
                     ->setAllowedInputTypes(1, {ALL_INTS})
                     ->setAllowedInputTypes(2, {ALL_INTS})
                     ->setSameMode(true);
}

////////////////////////////////////////////////////////////////////////////////
DECLARE_SHAPE_FN(batch_to_space_nd) {

    auto inputShapeInfo = inputShape->at(0);
    auto blockShapeInfo = inputShape->at(1);
    auto cropShapeInfo  = inputShape->at(2);

    REQUIRE_TRUE(blockShapeInfo[0] == 1, 0, "BatchToSpaceND: rank of blockShape array must be equal to one, but got %i instead !", blockShapeInfo[0]);

    const auto product = INPUT_VARIABLE(1)->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
    REQUIRE_TRUE(inputShapeInfo[1] % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, inputShapeInfo[1]);

    const auto numOfSpatialDims = blockShapeInfo[1];

    if(cropShapeInfo[1] != numOfSpatialDims || cropShapeInfo[2] != 2) {
        const std::string expectedCropShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
        REQUIRE_TRUE(false, 0, "BatchToSpaceND: operation expects crop shape to be %s, but got %s instead", expectedCropShape.c_str(), ShapeUtils::shapeAsString(cropShapeInfo).c_str());
    }

    std::vector<Nd4jLong> outShape(inputShapeInfo + 1, inputShapeInfo + 1 + inputShapeInfo[0]);

    outShape[0] /= product;

    for (uint i = 0; i < numOfSpatialDims; ++i)
        outShape[i + 1] = outShape[i + 1] * INPUT_VARIABLE(1)->e<Nd4jLong>(i) - INPUT_VARIABLE(2)->e<uint>(i,0) - INPUT_VARIABLE(2)->e<uint>(i,1);

    return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inputShapeInfo), 'c', outShape));
}

}
}

#endif
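Note (illustration, not part of this commit): the output-shape arithmetic of the shape function above, worked by hand against test batch_to_space_nd_2 from the test file at the bottom of this page:

    // input [24, 1,3,2, 1], blockShape {2, 2, 3}, crop {0,0, 0,2, 2,1}
    // outShape[0] = 24 / (2 * 2 * 3)  = 2
    // outShape[1] = 1 * 2 - 0 - 0     = 2
    // outShape[2] = 3 * 2 - 0 - 2     = 4
    // outShape[3] = 2 * 3 - 2 - 1     = 3
    // outShape[4] = 1                   (non-spatial tail dimension is untouched)
    // => output shape [2, 2,4,3, 1], matching NDArray exp in that test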
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and

 limitations under the License.
 ==============================================================================*/
 //
-// @author raver119@gmail.com, created on 19.01.18.
+// @author Yurii Shyrma (iuriish@yahoo.com)
+// @author raver119@gmail.com
 //

 #include <op_boilerplate.h>
@@ -39,12 +39,11 @@ CUSTOM_OP_IMPL(space_to_batch, 2, 1, false, 0, 1) {

     const uint blockSize = INT_ARG(0);
     REQUIRE_TRUE(blockSize >= 2, 0, "SpaceToBatch: integer parameter block_size must be >= 2, but got %i instead", blockSize);

-    const int rank = input->rankOf();
-    REQUIRE_TRUE(rank == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", rank);
+    REQUIRE_TRUE(input->rankOf() == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", input->rankOf());
+    REQUIRE_TRUE(output->rankOf() == 4, 0, "SpaceToBatch: rank of output array must be equal 4, but got %i instead", output->rankOf());

-    const std::string expectedpaddingShape = "[2, 2]";
-    const std::string actualpaddingShape = ShapeUtils::shapeAsString(padding);
-    REQUIRE_TRUE(actualpaddingShape == expectedpaddingShape, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", actualpaddingShape.c_str());
+    if(padding->sizeAt(0) != 2 || padding->sizeAt(1) != 2)
+        REQUIRE_TRUE(false, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(padding).c_str());

     const uint padBottom = padding->e<uint>(0,0);
     const uint padTop    = padding->e<uint>(0,1);

@@ -78,9 +77,8 @@ DECLARE_SHAPE_FN(space_to_batch) {

     const int rank = inputShapeInfo[0];
     REQUIRE_TRUE(rank == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", rank);

-    const std::string expectedpaddingShape = "[2, 2]";
-    const std::string actualpaddingShape = ShapeUtils::shapeAsString(paddingShapeInfo);
-    REQUIRE_TRUE(actualpaddingShape == expectedpaddingShape, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", actualpaddingShape.c_str());
+    if(paddingShapeInfo[1] != 2 || paddingShapeInfo[2] != 2)
+        REQUIRE_TRUE(false, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(paddingShapeInfo).c_str());

     const uint padBottom = INPUT_VARIABLE(1)->e<Nd4jLong>(0,0);
     const uint padTop    = INPUT_VARIABLE(1)->e<Nd4jLong>(0,1);
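Note (sketch, not part of this commit): a minimal space_to_batch invocation consistent with the validation above, assuming the execute() overload used by the tests at the end of this page; block_size is again the single integer argument.

    NDArray x('c', {1, 2, 2, 1}, nd4j::DataType::FLOAT32);
    NDArray padding('c', {2, 2}, {0, 0, 0, 0}, nd4j::DataType::INT32);
    x.linspace(1);

    nd4j::ops::space_to_batch op;
    auto result = op.execute({&x, &padding}, {}, {2});   // block_size = 2 -> output shape [4, 1, 1, 1]
    delete result;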
@@ -0,0 +1,108 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
//
// @author Yurii Shyrma (iuriish@yahoo.com)
//

#include <op_boilerplate.h>
#if NOT_EXCLUDED(OP_space_to_batch_nd)

#include <ops/declarable/headers/parity_ops.h>
#include <ops/declarable/helpers/s_t_b.h>

namespace nd4j {
namespace ops {

CUSTOM_OP_IMPL(space_to_batch_nd, 3, 1, false, 0, 0) {

    // 4D example, numOfSpatialDims = 2 - two spatial dimensions
    // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC]

    auto input      = INPUT_VARIABLE(0);
    auto blockShape = INPUT_VARIABLE(1);
    auto padding    = INPUT_VARIABLE(2);

    auto output = OUTPUT_VARIABLE(0);

    REQUIRE_TRUE(blockShape->rankOf() == 1, 0, "SpaceToBatchND: rank of blockShape array must be equal to one, but got %i instead !", blockShape->rankOf());

    const uint numOfSpatialDims = blockShape->sizeAt(0);

    REQUIRE_TRUE(input->rankOf() == output->rankOf(), 0, "SpaceToBatchND: rank of input and output array must be the same, but got %i and %i correspondingly !", input->rankOf(), output->rankOf());

    // FIXME - should we use this time-consuming validation ?
    for (uint i = 0; i < numOfSpatialDims; ++i) {
        const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
        REQUIRE_TRUE(blockSize >= 2, 0, "SpaceToBatchND: all elements of blockShape array must be >= 2, but got value of %i for element number %i !", blockSize, i);
    }

    if(padding->sizeAt(0) != numOfSpatialDims || padding->sizeAt(1) != 2) {
        const std::string expectedpaddingShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
        REQUIRE_TRUE(false, 0, "SpaceToBatchND: operation expects padding shape to be %s, but got %s instead", expectedpaddingShape.c_str(), ShapeUtils::shapeAsString(padding).c_str());
    }

    // FIXME - should we use this time-consuming validation ?
    for (uint i = 0; i < numOfSpatialDims; ++i) {
        const uint padLeft  = padding->e<uint>(i,0);
        const uint padRight = padding->e<uint>(i,1);
        const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
        REQUIRE_TRUE((input->sizeAt(i + 1) + padLeft + padRight) % blockSize == 0, 0, "SpaceToBatchND: after padding, spatial dimensions of input array must be divisible by blockSize !");
    }

    helpers::spaceToBatchND(block.launchContext(), *input, *blockShape, *padding, *output);

    return Status::OK();
}

////////////////////////////////////////////////////////////////////////////////
DECLARE_TYPES(space_to_batch_nd) {

    getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY)
                     ->setAllowedInputTypes(1, {ALL_INTS})
                     ->setAllowedInputTypes(2, {ALL_INTS})
                     ->setSameMode(true);
}

////////////////////////////////////////////////////////////////////////////////
DECLARE_SHAPE_FN(space_to_batch_nd) {

    auto inputShapeInfo   = inputShape->at(0);
    auto blockShapeInfo   = inputShape->at(1);
    auto paddingShapeInfo = inputShape->at(2);

    REQUIRE_TRUE(blockShapeInfo[0] == 1, 0, "SpaceToBatchND: rank of blockShape array must be equal to one, but got %i instead !", blockShapeInfo[0]);

    const uint numOfSpatialDims = blockShapeInfo[1];

    if(paddingShapeInfo[1] != numOfSpatialDims || paddingShapeInfo[2] != 2) {
        const std::string expectedpaddingShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
        REQUIRE_TRUE(false, 0, "SpaceToBatchND: operation expects padding shape to be %s, but got %s instead", expectedpaddingShape.c_str(), ShapeUtils::shapeAsString(paddingShapeInfo).c_str());
    }

    std::vector<Nd4jLong> outShape(inputShapeInfo + 1, inputShapeInfo + 1 + inputShapeInfo[0]);

    outShape[0] *= INPUT_VARIABLE(1)->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);

    for (uint i = 0; i < numOfSpatialDims; ++i)
        outShape[i + 1] = (outShape[i + 1] + INPUT_VARIABLE(2)->e<uint>(i,0) + INPUT_VARIABLE(2)->e<uint>(i,1)) / INPUT_VARIABLE(1)->e<Nd4jLong>(i);

    return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inputShapeInfo), 'c', outShape));
}

}
}

#endif
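Note (illustration, not part of this commit): the op above follows TensorFlow's space_to_batch_nd (hence the TensorFlow copyright header). Its shape arithmetic worked by hand against test space_to_batch_nd_2 below, plus a minimal invocation taken from test space_to_batch_nd_1:

    // input [2, 2,4,3, 1], blockShape {2, 2, 3}, paddings {0,0, 0,2, 2,1}
    // outShape[0] = 2 * (2 * 2 * 3)  = 24
    // outShape[1] = (2 + 0 + 0) / 2  = 1
    // outShape[2] = (4 + 0 + 2) / 2  = 3
    // outShape[3] = (3 + 2 + 1) / 3  = 2
    // outShape[4] = 1
    // => output shape [24, 1,3,2, 1], matching NDArray exp in that test

    NDArray x('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32);
    NDArray blockShape('c', {3}, {2, 2, 2}, nd4j::DataType::INT32);
    NDArray paddings('c', {3, 2}, {0, 0, 0, 0, 0, 0}, nd4j::DataType::INT32);
    x.linspace(1);

    nd4j::ops::space_to_batch_nd op;
    auto result = op.execute({&x, &blockShape, &paddings}, {}, {});   // -> output shape [8, 1, 1, 1, 3]
    delete result;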
@@ -617,6 +617,10 @@ namespace nd4j {

         DECLARE_CUSTOM_OP(space_to_batch, 2, 1, false, 0, 1);
         #endif

+        #if NOT_EXCLUDED(OP_space_to_batch_nd)
+        DECLARE_CUSTOM_OP(space_to_batch_nd, 3, 1, false, 0, 0);
+        #endif

         /**
          *
          *

@@ -624,6 +628,9 @@ namespace nd4j {

         #if NOT_EXCLUDED(OP_batch_to_space)
         DECLARE_CUSTOM_OP(batch_to_space, 2, 1, false, 0, 1);
         #endif
+        #if NOT_EXCLUDED(OP_batch_to_space_nd)
+        DECLARE_CUSTOM_OP(batch_to_space_nd, 3, 1, false, 0, 0);
+        #endif

         /**
          * top_k operation returns a vector of k top values for
@@ -15,8 +15,8 @@
 ******************************************************************************/

 //
-// @author raver119@gmail.com, created on 19.01.18.
+// @author Yurii Shyrma (iuriish@yahoo.com)
+// @author raver119@gmail.com
 //

 #include <ops/declarable/helpers/s_t_b.h>
@@ -90,13 +90,107 @@ void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& o
    }
}

//////////////////////////////////////////////////////////////////////////
template <typename T>
static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims) {

    // input  [bS, H * blockShape[0], W * blockShape[1], iC]
    // output [bS, H * blockShape[0] - cropBottom - cropTop, W * blockShape[1] - cropLeft - cropRight, iC]

    // if (cropTop = cropBottom = cropRight = cropLeft = 0) shapes are the same
    // else:
    // oH -> [cropBottom, iH - cropTop]
    // oW -> [cropLeft, iW - cropRight]
    // xLen >= zLen

    const T* x = input.bufferAsT<T>();
          T* z = output.bufferAsT<T>();

    const int rank = input.rankOf();
    const Nd4jLong zLen = output.lengthOf();

    std::vector<Nd4jLong> coords(rank);

    // loop through output array
    PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords))
    for (Nd4jLong i = 0; i < zLen; ++i) {

        shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data());

        const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank);

        // evaluate spatial coordinates for x
        for(uint j = 1; j <= numOfSpatialDims; ++j)
            coords[j] += crop.e<uint>(j - 1, 0);       // add crop left

        z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)];
    }
}

BUILD_SINGLE_TEMPLATE(template void batchToSpaceND_, (const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES);

//////////////////////////////////////////////////////////////////////////
void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) {

    // 4D example, numOfSpatialDims = 2 - two spatial dimensions
    // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC]

    const uint rank = input.rankOf();
    const uint numOfSpatialDims = blockShape.sizeAt(0);

    //*** construct reshaping std::vector for first reshape of input array ***//

    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);

    int i;
    for(i = 0; i < numOfSpatialDims; ++i)
        temp[i] = blockShape.e<Nd4jLong>(i);
    temp[i++] = output.sizeAt(0);
    for(int j = 1; j < rank; ++i, ++j)
        temp[i] = input.sizeAt(j);

    NDArray inputRearranged0 = input.reshape(input.ordering(), temp);

    //*** construct permuting std::vector for permutation of input array ***//

    temp[0] = numOfSpatialDims;

    for(i = 1; i <= numOfSpatialDims; ++i) {
        temp[2*i - 1] = numOfSpatialDims + i;
        temp[2*i]     = i - 1;
    }
    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
        temp[i] = i;

    inputRearranged0.permutei(temp);


    if(input.lengthOf() == output.lengthOf()) {
        output.assign(inputRearranged0);
    }
    else {
        //*** construct reshaping std::vector for second reshape of input array ***//

        temp.resize(rank);

        temp[0] = output.sizeAt(0);

        for(i = 1; i < rank; ++i)
            temp[i] = (i <= numOfSpatialDims) ? input.sizeAt(i) * blockShape.e<Nd4jLong>(i - 1) : input.sizeAt(i);

        NDArray inputRearranged1 = inputRearranged0.reshape(input.ordering(), temp);

        BUILD_SINGLE_SELECTOR(input.dataType(), batchToSpaceND_, (inputRearranged1, crop, output, numOfSpatialDims), LIBND4J_TYPES);
    }
}
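Note (illustration, not part of this commit): the permute-vector construction above, worked by hand for the rank-4, two-spatial-dims case:

    // after the first reshape the array is [blockShape[0], blockShape[1], bS, d1, d2, d3]
    //
    // temp[0] = numOfSpatialDims  -> 2     (moves bS to the front)
    // i = 1:  temp[1] = 2 + 1 = 3          (first spatial dim)
    //         temp[2] = 0                  (blockShape[0])
    // i = 2:  temp[3] = 2 + 2 = 4          (second spatial dim)
    //         temp[4] = 1                  (blockShape[1])
    // i = 5:  temp[5] = 5                  (remaining dims keep their positions)
    //
    // => permutation {2, 3, 0, 4, 1, 5} - the same {2, 3,0, 4,1, 5} that the
    //    fixed-rank spaceToBatch path below hard-codes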
//////////////////////////////////////////////////////////////////////////
template <typename T>
static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight) {

    // input  [bS, H * blockSize - padBottom - padTop, W * blockSize - padLeft - padRight, iC]
-   // output [bs, H * blockSize, W * blockSize, iC]
+   // output [bS, H * blockSize, W * blockSize, iC]

    // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same
    // else:

@@ -145,26 +239,153 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o

    // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC]

-   NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), input.sizeAt(3)});
+   NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), output.sizeAt(3)});
    outputRearranged0.permutei({2, 3,0, 4,1, 5});

    if(input.lengthOf() == output.lengthOf()) {
        outputRearranged0.assign(input);
    }
    else {
-       NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, input.sizeAt(3)});
+       NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, output.sizeAt(3)});
        BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatch_, (input, outputRearranged1, padBottom, padTop, padLeft, padRight), LIBND4J_TYPES);

        if(output.getBuffer() != outputRearranged1.getBuffer())
            outputRearranged0.assign(outputRearranged1);
    }
}
//////////////////////////////////////////////////////////////////////////
template <typename T>
static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims) {

    // 4D example
    // input  [bS, H * blockShape[0] - padBottom - padTop, W * blockShape[1] - padLeft - padRight, iC]
    // output [bS, H * blockShape[0], W * blockShape[1], iC]

    // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same
    // else:
    // iH -> [padBottom, oH - padTop]
    // iW -> [padLeft, oW - padRight]
    // zLen > xLen

    const T* x = input.bufferAsT<T>();
          T* z = output.bufferAsT<T>();

    const int rank = input.rankOf();
    const Nd4jLong zLen = output.lengthOf();

    std::vector<Nd4jLong> coords(rank);

    // loop through output array
    PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords))
    for (Nd4jLong i = 0; i < zLen; ++i) {

        shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data());

        const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank);

        bool within = true;

        for(uint j = 1; j <= numOfSpatialDims; ++j) {

            const auto padLeft  = padding.e<uint>(j - 1, 0);
            const auto padRight = padding.e<uint>(j - 1, 1);

            within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight);

            if(!within)
                break;

            coords[j] -= padLeft;       // get coordinates for x
        }

        if(within)
            z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)];
        else
            z[zOffset] = 0.f;
    }
}

BUILD_SINGLE_TEMPLATE(template void spaceToBatchND_, (const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES);

//////////////////////////////////////////////////////////////////////////
void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) {

    // 4D example with two spatial dimensions
    // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC]

    const uint rank = input.rankOf();

    const uint numOfSpatialDims = blockShape.sizeAt(0);

    //*** construct reshaping std::vector for first reshape of output array ***//
    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);

    int i;
    for(i = 0; i < numOfSpatialDims; ++i)
        temp[i] = blockShape.e<Nd4jLong>(i);
    temp[i++] = input.sizeAt(0);
    for(int j = 1; j < rank; ++i, ++j)
        temp[i] = output.sizeAt(j);

    NDArray outputRearranged0 = output.reshape(output.ordering(), temp);

    //*** construct permuting std::vector for permutation of output array ***//

    temp[0] = numOfSpatialDims;

    for(i = 1; i <= numOfSpatialDims; ++i) {
        temp[2*i - 1] = numOfSpatialDims + i;
        temp[2*i]     = i - 1;
    }
    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
        temp[i] = i;

    outputRearranged0.permutei(temp);

    // ****** //

    if(input.lengthOf() == output.lengthOf()) {
        outputRearranged0.assign(input);
    }
    else {

        //*** construct reshaping std::vector for second reshape of output array ***//
        temp.resize(rank);

        temp[0] = input.sizeAt(0);

        for(i = 1; i < rank; ++i)
            temp[i] = (i <= numOfSpatialDims) ? output.sizeAt(i) * blockShape.e<Nd4jLong>(i - 1) : output.sizeAt(i);

        NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp);

        BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatchND_, (input, padding, outputRearranged1, numOfSpatialDims), LIBND4J_TYPES);

        if(output.getBuffer() != outputRearranged1.getBuffer())
            outputRearranged0.assign(outputRearranged1);
    }
}


/*
template <int N, bool B2S>
struct SpaceToBatchHelper {
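Note (illustration, not part of this commit): the two reshape vectors above, worked by hand for test space_to_batch_nd_2 from the test file below:

    // input [2, 2,4,3, 1], blockShape {2, 2, 3}, paddings {0,0, 0,2, 2,1}, output [24, 1,3,2, 1]
    //
    // first reshape of output: [2, 2, 3, 2, 1, 3, 2, 1]   // blockShape values, then bS, then output dims 1..4
    // permutation:             {3, 4, 0, 5, 1, 6, 2, 7}
    // second reshape:          [2, 2, 6, 6, 1]            // == padded input shape [2, 2+0+0, 4+0+2, 3+2+1, 1]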
@@ -135,10 +135,10 @@ __global__ static void col2imCuda2(const void *columns, void *image, const Nd4jL

    for (int i = (blockDim.x * blockIdx.x) + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
        T val = 0;

        int w_im = i % iW + pW;
        int h_im = (i / iW) % iH + pH;
        int c_im = i / (iW * iH);

        int b = c_im / iC;
        int c = c_im % iC;
@@ -39,8 +39,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI

    T* col = reinterpret_cast<T*>(columns);

    __shared__ int colRank, volRank;
-   __shared__ Nd4jLong colLen, iD, iH, iW;
-   __shared__ Nd4jLong *sharedMem;
+   __shared__ Nd4jLong colLen, iD, iH, iW, *sharedMem;

    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
@@ -38,13 +38,13 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn

    // else:
    // oH -> [cropBottom, iH - cropTop]
    // oW -> [cropLeft, iW - cropRight]
-   // xLen > zLen
+   // xLen >= zLen

    const auto x = reinterpret_cast<const T*>(vx);
          auto z = reinterpret_cast<T*>(vz);

    __shared__ int rank;
-   __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
+   __shared__ Nd4jLong zLen, *sharedMem;

    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];

@@ -52,7 +52,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn

        rank = shape::rank(zShapeInfo);
        zLen = shape::length(zShapeInfo);
-       totalThreads = gridDim.x * blockDim.x;
    }
    __syncthreads();
@@ -116,6 +115,139 @@ void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& o
    }
}


///////////////////////////////////////////////////////////////////
template<typename X, typename Y>
__global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShapeInfo,
                                          const void* vy, const Nd4jLong* yShapeInfo,
                                                void* vz, const Nd4jLong* zShapeInfo,
                                          const uint numOfSpatialDims) {

    // 4D example, numOfSpatialDims = 2
    // input  [bS, H * blockShape[0], W * blockShape[1], iC]
    // output [bS, H * blockShape[0] - cropBottom - cropTop, W * blockShape[1] - cropLeft - cropRight, iC]

    // if (cropTop = cropBottom = cropRight = cropLeft = 0) shapes are the same
    // else:
    // oH -> [cropBottom, iH - cropTop]
    // oW -> [cropLeft, iW - cropRight]
    // xLen >= zLen

    const auto x = reinterpret_cast<const X*>(vx);
    const auto y = reinterpret_cast<const Y*>(vy);
          auto z = reinterpret_cast<X*>(vz);

    __shared__ int rank;
    __shared__ Nd4jLong zLen, *sharedMem;

    if (threadIdx.x == 0) {

        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

        rank = shape::rank(zShapeInfo);
        zLen = shape::length(zShapeInfo);
    }

    __syncthreads();

    auto coords = sharedMem + threadIdx.x * rank;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) {

        shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords);

        const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);

        // evaluate spatial coordinates for x
        for(uint j = 1; j <= numOfSpatialDims; ++j) {
            const auto yOffset = (j - 1) * yShapeInfo[3];   // yRank = 2, calculate offset manually
            coords[j] += y[yOffset];                        // add crop left
        }

        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);

        z[zOffset] = x[xOffset];
    }
}

///////////////////////////////////////////////////////////////////
template<typename X,typename Y>
static void batchToSpaceNDCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims) {

    batchToSpaceNDCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, numOfSpatialDims);
}
BUILD_DOUBLE_TEMPLATE(template void batchToSpaceNDCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES);

//////////////////////////////////////////////////////////////////////////
void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) {

    // 4D example, numOfSpatialDims = 2 - two spatial dimensions
    // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC]

    const uint rank = input.rankOf();
    const uint numOfSpatialDims = blockShape.sizeAt(0);

    //*** construct reshaping std::vector for first reshape of input array ***//

    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);

    int i;
    for(i = 0; i < numOfSpatialDims; ++i)
        temp[i] = blockShape.e<Nd4jLong>(i);
    temp[i++] = output.sizeAt(0);
    for(int j = 1; j < rank; ++i, ++j)
        temp[i] = input.sizeAt(j);

    NDArray inputRearranged0 = input.reshape(input.ordering(), temp);

    //*** construct permuting std::vector for permutation of input array ***//

    temp[0] = numOfSpatialDims;

    for(i = 1; i <= numOfSpatialDims; ++i) {
        temp[2*i - 1] = numOfSpatialDims + i;
        temp[2*i]     = i - 1;
    }
    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
        temp[i] = i;

    inputRearranged0.permutei(temp);


    if(input.lengthOf() == output.lengthOf()) {

        output.assign(inputRearranged0);
    }
    else {
        //*** construct reshaping std::vector for second reshape of input array ***//

        temp.resize(rank);

        temp[0] = output.sizeAt(0);

        for(i = 1; i < rank; ++i)
            temp[i] = (i <= numOfSpatialDims) ? input.sizeAt(i) * blockShape.e<Nd4jLong>(i - 1) : input.sizeAt(i);

        NDArray inputRearranged1 = inputRearranged0.reshape(input.ordering(), temp);

        const int threadsPerBlock = MAX_NUM_THREADS / 4;
        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;

        PointersManager manager(context, "batchToSpaceND");

        NDArray::prepareSpecialUse({&output}, {&inputRearranged1, &crop});
        BUILD_DOUBLE_SELECTOR(input.dataType(), crop.dataType(), batchToSpaceNDCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), inputRearranged1.getSpecialBuffer(), inputRearranged1.getSpecialShapeInfo(), crop.getSpecialBuffer(), crop.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES);
        NDArray::registerSpecialUse({&output}, {&inputRearranged1, &crop});

        manager.synchronize();
    }
}



///////////////////////////////////////////////////////////////////
template<typename T>
__global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight) {
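Note (illustration, not part of this commit): the launch-configuration arithmetic computed in batchToSpaceND above (and reused by spaceToBatchND below), assuming the usual libnd4j value MAX_NUM_THREADS = 512 and a rank-5 output like the tests on this page:

    // threadsPerBlock = 512 / 4 = 128
    // blocksPerGrid   = (output.lengthOf() + 127) / 128
    // sharedMem       = 128 * sizeof(Nd4jLong) * 5 + 128 = 128 * 8 * 5 + 128 = 5248 bytes
    //
    // i.e. one coordinate vector of `rank` Nd4jLongs per thread, plus 128 bytes of slack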
@@ -133,7 +265,7 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn

    auto z = reinterpret_cast<T*>(vz);

    __shared__ int rank;
-   __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
+   __shared__ Nd4jLong zLen, *sharedMem;

    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];

@@ -141,7 +273,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn

        rank = shape::rank(zShapeInfo);
        zLen = shape::length(zShapeInfo);
-       totalThreads = gridDim.x * blockDim.x;
    }
    __syncthreads();
@@ -210,6 +341,153 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o
    }
}

///////////////////////////////////////////////////////////////////
template<typename X, typename Y>
__global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShapeInfo,
                                          const void* vy, const Nd4jLong* yShapeInfo,
                                                void* vz, const Nd4jLong* zShapeInfo,
                                          const uint numOfSpatialDims) {

    // x - input, y - padding, z - output

    // 4D example
    // input  [bS, H * blockShape[0] - padBottom - padTop, W * blockShape[1] - padLeft - padRight, iC]
    // output [bS, H * blockShape[0], W * blockShape[1], iC]

    // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same
    // else:
    // iH -> [padBottom, oH - padTop]
    // iW -> [padLeft, oW - padRight]
    // zLen > xLen

    const auto x = reinterpret_cast<const X*>(vx);
    const auto y = reinterpret_cast<const Y*>(vy);
          auto z = reinterpret_cast<X*>(vz);

    __shared__ int rank;    // xRank = zRank, yRank = 2;
    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;

    if (threadIdx.x == 0) {

        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);

        rank = shape::rank(zShapeInfo);
        zLen = shape::length(zShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
    }

    __syncthreads();

    auto coords = sharedMem + threadIdx.x * rank;

    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < zLen; i += totalThreads) {

        shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords);

        const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);

        bool within = true;

        for(uint j = 1; j <= numOfSpatialDims; ++j) {

            // yRank = 2, calculate offset manually
            const auto yOffset  = (j - 1) * yShapeInfo[3];
            const auto padLeft  = y[yOffset];
            const auto padRight = y[yOffset + yShapeInfo[4]];

            within &= (coords[j] >= padLeft && coords[j] < shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo))[j] - padRight);

            if(!within)
                break;

            coords[j] -= padLeft;       // get coordinates for x
        }

        if(within)
            z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)];
        else
            z[zOffset] = 0.f;
    }
}

///////////////////////////////////////////////////////////////////
template<typename X, typename Y>
static void spaceToBatchNDCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims) {

    spaceToBatchNDCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, numOfSpatialDims);
}
BUILD_DOUBLE_TEMPLATE(template void spaceToBatchNDCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES);

//////////////////////////////////////////////////////////////////////////
void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) {

    // 4D example with two spatial dimensions
    // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC]

    const uint rank = input.rankOf();

    const uint numOfSpatialDims = blockShape.sizeAt(0);

    //*** construct reshaping std::vector for first reshape of output array ***//
    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);

    int i;
    for(i = 0; i < numOfSpatialDims; ++i)
        temp[i] = blockShape.e<Nd4jLong>(i);
    temp[i++] = input.sizeAt(0);
    for(int j = 1; j < rank; ++i, ++j)
        temp[i] = output.sizeAt(j);

    NDArray outputRearranged0 = output.reshape(output.ordering(), temp);

    //*** construct permuting std::vector for permutation of output array ***//

    temp[0] = numOfSpatialDims;

    for(i = 1; i <= numOfSpatialDims; ++i) {
        temp[2*i - 1] = numOfSpatialDims + i;
        temp[2*i]     = i - 1;
    }
    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
        temp[i] = i;

    outputRearranged0.permutei(temp);

    // ****** //

    if(input.lengthOf() == output.lengthOf()) {
        outputRearranged0.assign(input);
    }
    else {

        //*** construct reshaping std::vector for second reshape of output array ***//
        temp.resize(rank);

        temp[0] = input.sizeAt(0);

        for(i = 1; i < rank; ++i)
            temp[i] = (i <= numOfSpatialDims) ? output.sizeAt(i) * blockShape.e<Nd4jLong>(i - 1) : output.sizeAt(i);

        NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp);

        const int threadsPerBlock = MAX_NUM_THREADS / 4;
        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;

        PointersManager manager(context, "spaceToBatchND");

        NDArray::prepareSpecialUse({&outputRearranged1}, {&input, &padding});
        BUILD_DOUBLE_SELECTOR(input.dataType(), padding.dataType(), spaceToBatchNDCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), padding.getSpecialBuffer(), padding.getSpecialShapeInfo(), outputRearranged1.specialBuffer(), outputRearranged1.specialShapeInfo(), numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES);
        NDArray::registerSpecialUse({&outputRearranged1}, {&input, &padding});

        manager.synchronize();

        if(output.getSpecialBuffer() != outputRearranged1.getSpecialBuffer())
            outputRearranged0.assign(outputRearranged1);
    }
}


/*
template <int N, bool B2S>
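Note (illustration, not part of this commit): how the "calculate offset manually" lines in the kernels above index the rank-2 padding/crop array y. A libnd4j shapeInfo for rank 2 is laid out as

    // yShapeInfo = { 2, d0, d1, s0, s1, ... }   // rank, shape, strides, ...

so yShapeInfo[3] is the stride of dimension 0 and yShapeInfo[4] the stride of dimension 1 (the same layout the shape::getOffset calls rely on via zShapeInfo + 1 and zShapeInfo + rank + 1). Hence y[(j - 1) * yShapeInfo[3]] reads pad-left for spatial dimension j-1, and y[(j - 1) * yShapeInfo[3] + yShapeInfo[4]] reads pad-right.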
@@ -31,6 +31,10 @@ namespace helpers {

    void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize);

+   void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output);

+   void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output);

    /*
    // this method MUST be platform-specific
@@ -689,3 +689,161 @@ TEST_F(DeclarableOpsTests13, cyclic_rshift_bits_1) {

    delete result;
}

////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, space_to_batch_nd_1) {

    NDArray x('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32);
    NDArray blockShape('c', {3}, {2, 2, 2} , nd4j::DataType::INT32);    // three spatial dimensions
    NDArray paddings('c', {3, 2}, {0, 0, 0, 0, 0, 0} , nd4j::DataType::INT32);

    NDArray exp('c', {8, 1, 1, 1, 3}, nd4j::DataType::FLOAT32);

    x.linspace(1);
    exp.linspace(1);

    nd4j::ops::space_to_batch_nd op;
    auto result = op.execute({&x, &blockShape, &paddings}, {}, {});
    ASSERT_EQ(Status::OK(), result->status());

    auto z = result->at(0);

    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));

    delete result;
}

////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, space_to_batch_nd_2) {

    NDArray x('c', {2, 2,4,3, 1}, nd4j::DataType::FLOAT32);
    NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32);    // three spatial dimensions
    NDArray paddings('c', {3, 2}, {0,0, 0,2, 2,1} , nd4j::DataType::INT32);

    NDArray exp('c', {24, 1,3,2, 1}, { 0, 2, 0, 8, 0, 0, 0, 26, 0, 32, 0, 0, 0, 3, 0, 9, 0, 0, 0, 27, 0, 33, 0, 0, 1,
                                       0, 7, 0, 0, 0, 25, 0, 31, 0, 0, 0, 0, 5, 0, 11, 0, 0, 0, 29, 0, 35, 0, 0, 0, 6,
                                       0, 12, 0, 0, 0, 30, 0, 36, 0, 0, 4, 0, 10, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 14,
                                       0, 20, 0, 0, 0, 38, 0, 44, 0, 0, 0, 15, 0, 21, 0, 0, 0, 39, 0, 45, 0, 0, 13, 0,
                                       19, 0, 0, 0, 37, 0, 43, 0, 0, 0, 0, 17, 0, 23, 0, 0, 0, 41, 0, 47, 0, 0, 0, 18,
                                       0, 24, 0, 0, 0, 42, 0, 48, 0, 0, 16, 0, 22, 0, 0, 0, 40, 0, 46, 0, 0, 0}, nd4j::DataType::FLOAT32);
    x.linspace(1);

    nd4j::ops::space_to_batch_nd op;
    auto result = op.execute({&x, &blockShape, &paddings}, {}, {});
    ASSERT_EQ(Status::OK(), result->status());

    auto z = result->at(0);
    // z->printBuffer();

    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));

    delete result;
}

////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, space_to_batch_nd_3) {

    NDArray x('c', {2, 2,4,3, 1}, nd4j::DataType::FLOAT32);
    NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32);    // three spatial dimensions
    NDArray paddings('c', {3, 2}, {1,1, 0,2, 2,1} , nd4j::DataType::INT32);

    NDArray exp('c', {24, 2,3,2, 1}, { 0, 0, 0, 0, 0, 0, 0, 14, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15,
                                       0, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 45, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 19, 0, 0, 0, 0, 0, 0, 0,
                                       0, 0, 37, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 0, 47, 0, 0,
                                       0, 0, 0, 0, 0, 0, 0, 18, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0, 48, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0,
                                       22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 0, 46, 0, 0, 0, 0, 2, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0, 32,
                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 1,
                                       0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 11, 0, 0, 0, 0, 0, 0,
                                       0, 0, 0, 29, 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 0, 36, 0, 0,
                                       0, 0, 0, 0, 0, 0, 4, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0}, nd4j::DataType::FLOAT32);
    x.linspace(1);

    nd4j::ops::space_to_batch_nd op;
    auto result = op.execute({&x, &blockShape, &paddings}, {}, {});
    ASSERT_EQ(Status::OK(), result->status());

    auto z = result->at(0);
    // z->printBuffer();

    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));

    delete result;
}

////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batch_to_space_nd_1) {

    NDArray x('c', {8, 1, 1, 1, 3}, nd4j::DataType::FLOAT32);

    NDArray blockShape('c', {3}, {2, 2, 2} , nd4j::DataType::INT32);    // three spatial dimensions
    NDArray crop('c', {3, 2}, {0, 0, 0, 0, 0, 0} , nd4j::DataType::INT32);

    NDArray exp('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32);

    x.linspace(1);
    exp.linspace(1);

    nd4j::ops::batch_to_space_nd op;
    auto result = op.execute({&x, &blockShape, &crop}, {}, {});
    ASSERT_EQ(Status::OK(), result->status());

    auto z = result->at(0);

    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));

    delete result;
}

////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batch_to_space_nd_2) {

    NDArray x('c', {24, 1,3,2, 1}, nd4j::DataType::FLOAT32);
    NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32);    // three spatial dimensions
    NDArray crop('c', {3, 2}, {0,0, 0,2, 2,1} , nd4j::DataType::INT32);

    NDArray exp('c', {2, 2,4,3, 1}, {25, 2, 14, 61, 38, 50, 27, 4, 16, 63, 40, 52, 97, 74, 86, 133, 110, 122, 99, 76, 88, 135, 112, 124,
                                     31, 8, 20, 67, 44, 56, 33, 10, 22, 69, 46, 58, 103, 80, 92, 139, 116, 128, 105, 82, 94, 141, 118, 130}, nd4j::DataType::FLOAT32);
    x.linspace(1);

    nd4j::ops::batch_to_space_nd op;
    auto result = op.execute({&x, &blockShape, &crop}, {}, {});
    ASSERT_EQ(Status::OK(), result->status());

    auto z = result->at(0);
    // z->printBuffer();

    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));

    delete result;
}

////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests13, batch_to_space_nd_3) {

    NDArray x('c', {24, 2,3,2, 1}, nd4j::DataType::FLOAT32);
    NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32);    // three spatial dimensions
    NDArray crop('c', {3, 2}, {1,1, 0,2, 2,1} , nd4j::DataType::INT32);

    NDArray exp('c', {2, 2,4,3, 1}, {193, 146, 170, 265, 218, 242, 195, 148, 172, 267, 220, 244, 55, 8, 32, 127, 80, 104, 57, 10, 34, 129, 82,
                                     106, 205, 158, 182, 277, 230, 254, 207, 160, 184, 279, 232, 256, 67, 20, 44, 139, 92, 116, 69, 22, 46, 141, 94, 118}, nd4j::DataType::FLOAT32);
    x.linspace(1);

    nd4j::ops::batch_to_space_nd op;
    auto result = op.execute({&x, &blockShape, &crop}, {}, {});
    ASSERT_EQ(Status::OK(), result->status());

    auto z = result->at(0);
    // z->printBuffer();

    ASSERT_TRUE(exp.isSameShape(z));
    ASSERT_TRUE(exp.equalsTo(z));

    delete result;
}
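Note (illustration, not part of this commit): the expected buffers in these tests are built with linspace, which fills an NDArray with consecutive values starting at the given number; in the padded tests, the nonzero entries of exp are exactly those linspace values routed to their batch positions.

    NDArray a('c', {2, 3}, nd4j::DataType::FLOAT32);
    a.linspace(1);     // buffer: 1, 2, 3, 4, 5, 6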