[WIP] stb/bts nd (#144)
* - start working on space_to_batch_nd op
* - provide cpu helper for space_to_batch_nd op
* - few typos fixed
* - add tests for space_to_batch and correct bugs
* - write cuda kernel for space_to_batch op
* - add order argument to shape::index2coords method in convolution cuda ops
* - restore some previous code
* - old col2im kernel activated
* - change coords calculation in col2im kernel
* - restore old col2im kernel
* - add custom op for batch_to_space
* - provide cpu version for batch_to_space_nd op
* - provide cuda kernel for batch_to_space_nd op

Signed-off-by: Yurii <yurii@skymind.io>
Signed-off-by: raver119 <raver119@gmail.com>
parent e604ffe0d2
commit eea3062ccf
@@ -599,6 +599,7 @@ namespace nd4j {
         /**
         *  apply scalar operation to array
         *  extraParams - extra parameters for operation
+        *  returns scalar array
         */
         NDArray reduceNumber(nd4j::reduce::FloatOps ops, void *extraParams = nullptr) const;
         NDArray reduceNumber(nd4j::reduce::SameOps ops, void *extraParams = nullptr) const;
@@ -30,7 +30,6 @@ limitations under the License.
 ==============================================================================*/

 //
-// @author raver119@gmail.com, created on 19.01.18.
 // @author Yurii Shyrma (iuriish@yahoo.com)
 //

@@ -63,9 +62,8 @@ CUSTOM_OP_IMPL(batch_to_space, 2, 1, false, 0, 1) {
     REQUIRE_TRUE(rank == 4, 0, "BatchToSpace: rank of input array must be equal 4, but got %i instead", rank);
     REQUIRE_TRUE(dim0 % (blockSize * blockSize) == 0, 0, "BatchToSpace: first dimension of input array must be divisible by blockSize * blockSize (that is by %i), but got first dimension equal to %i", blockSize * blockSize, dim0);

-    const std::string expectedCropShape = "[2, 2]";
-    const std::string actualCropShape = ShapeUtils::shapeAsString(crop);
-    REQUIRE_TRUE(actualCropShape == expectedCropShape, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", actualCropShape.c_str());
+    if(crop->sizeAt(0) != 2 || crop->sizeAt(1) != 2)
+        REQUIRE_TRUE(false, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(crop).c_str());

     const uint cropBottom = crop->e<uint>(0,0);
     const uint cropTop    = crop->e<uint>(0,1);

@@ -104,9 +102,8 @@ DECLARE_SHAPE_FN(batch_to_space) {
     REQUIRE_TRUE(rank == 4, 0, "BatchToSpace: rank of input array must be equal 4, but got %i instead", rank);
     REQUIRE_TRUE(dim0 % (blockSize * blockSize) == 0, 0, "BatchToSpace: first dimension of input array must be divisible by blockSize * blockSize (that is by %i), but got first dimension equal to %i", blockSize * blockSize, dim0);

-    const std::string expectedCropShape = "[2, 2]";
-    const std::string actualCropShape = ShapeUtils::shapeAsString(cropShapeInfo);
-    REQUIRE_TRUE(actualCropShape == expectedCropShape, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", actualCropShape.c_str());
+    if(cropShapeInfo[1] != 2 || cropShapeInfo[2] != 2)
+        REQUIRE_TRUE(false, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(cropShapeInfo).c_str());

     const uint cropBottom = INPUT_VARIABLE(1)->e<Nd4jLong>(0,0);
     const uint cropTop    = INPUT_VARIABLE(1)->e<Nd4jLong>(0,1);
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <op_boilerplate.h>
+#if NOT_EXCLUDED(OP_batch_to_space_nd)
+
+#include <ops/declarable/headers/parity_ops.h>
+#include <ops/declarable/helpers/s_t_b.h>
+
+namespace nd4j {
+namespace ops {
+
+CUSTOM_OP_IMPL(batch_to_space_nd, 3, 1, false, 0, 0) {
+
+    // 4D example, numOfSpatialDims = 2 - two spatial dimensions
+    // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC]
+
+    auto input      = INPUT_VARIABLE(0);
+    auto blockShape = INPUT_VARIABLE(1);
+    auto crop       = INPUT_VARIABLE(2);
+
+    auto output = OUTPUT_VARIABLE(0);
+
+    REQUIRE_TRUE(blockShape->rankOf() == 1, 0, "BatchToSpaceND: rank of blockShape array must be equal to one, but got %i instead !", blockShape->rankOf());
+
+    const uint numOfSpatialDims = blockShape->sizeAt(0);
+
+    const auto product = blockShape->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
+    REQUIRE_TRUE(input->sizeAt(0) % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, input->sizeAt(0));
+
+    // FIXME - should we use this time-consuming validation ?
+    for (uint i = 0; i < numOfSpatialDims; ++i) {
+        const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
+        REQUIRE_TRUE(blockSize >= 2, 0, "BatchToSpaceND: all elements of blockShape array must be >= 2, but got value of %i for element number %i !", blockSize, i);
+    }
+
+    if(crop->sizeAt(0) != numOfSpatialDims || crop->sizeAt(1) != 2) {
+        const std::string expectedCropShape = "[" + std::to_string(numOfSpatialDims) + ", 2]";      // [numOfSpatialDims, 2]
+        REQUIRE_TRUE(false, 0, "BatchToSpaceND: operation expects crop shape to be %s, but got %s instead", expectedCropShape.c_str(), ShapeUtils::shapeAsString(crop).c_str());
+    }
+
+    // FIXME - should we use this time-consuming validation ?
+    for (uint i = 0; i < numOfSpatialDims; ++i) {
+        const auto cropLeft      = crop->e<uint>(i,0);
+        const auto cropRight     = crop->e<uint>(i,1);
+        const auto outSpatialDim = input->sizeAt(i + 1) * blockShape->e<Nd4jLong>(i) - cropLeft - cropRight;
+        REQUIRE_TRUE(outSpatialDim >= 0, 0, "BatchToSpaceND: crop left/right values are too big and cause negative output spatial dimension/dimensions !");
+    }
+
+    helpers::batchToSpaceND(block.launchContext(), *input, *blockShape, *crop, *output);
+
+    return Status::OK();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+DECLARE_TYPES(batch_to_space_nd) {
+    getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY)
+                     ->setAllowedInputTypes(1, {ALL_INTS})
+                     ->setAllowedInputTypes(2, {ALL_INTS})
+                     ->setSameMode(true);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+DECLARE_SHAPE_FN(batch_to_space_nd) {
+
+    auto inputShapeInfo = inputShape->at(0);
+    auto blockShapeInfo = inputShape->at(1);
+    auto cropShapeInfo  = inputShape->at(2);
+
+    REQUIRE_TRUE(blockShapeInfo[0] == 1, 0, "BatchToSpaceND: rank of blockShape array must be equal to one, but got %i instead !", blockShapeInfo[0]);
+
+    const auto product = INPUT_VARIABLE(1)->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
+    REQUIRE_TRUE(inputShapeInfo[1] % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, inputShapeInfo[1]);
+
+    const auto numOfSpatialDims = blockShapeInfo[1];
+
+    if(cropShapeInfo[1] != numOfSpatialDims || cropShapeInfo[2] != 2) {
+        const std::string expectedCropShape = "[" + std::to_string(numOfSpatialDims) + ", 2]";      // [numOfSpatialDims, 2]
+        REQUIRE_TRUE(false, 0, "BatchToSpaceND: operation expects crop shape to be %s, but got %s instead", expectedCropShape.c_str(), ShapeUtils::shapeAsString(cropShapeInfo).c_str());
+    }
+
+    std::vector<Nd4jLong> outShape(inputShapeInfo + 1, inputShapeInfo + 1 + inputShapeInfo[0]);
+
+    outShape[0] /= product;
+
+    for (uint i = 0; i < numOfSpatialDims; ++i)
+        outShape[i + 1] = outShape[i + 1] * INPUT_VARIABLE(1)->e<Nd4jLong>(i) - INPUT_VARIABLE(2)->e<uint>(i,0) - INPUT_VARIABLE(2)->e<uint>(i,1);
+
+    return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inputShapeInfo), 'c', outShape));
+}
+
+}
+}
+
+#endif
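For reference, a minimal standalone sketch (not part of this commit) of the index mapping batch_to_space_nd performs, reduced to a single spatial dimension; the helper name batchToSpace1D and the tiny example are illustrative only:

    // input [N*block, W, C] -> output [N, W*block - cropLeft - cropRight, C]:
    // block copies of the batch interleave along the spatial axis, then crop.
    #include <cstdio>
    #include <vector>

    static std::vector<float> batchToSpace1D(const std::vector<float>& in,
                                             int N, int W, int C,
                                             int block, int cropLeft, int cropRight) {
        const int oW = W * block - cropLeft - cropRight;
        std::vector<float> out(N * oW * C);
        for (int n = 0; n < N; ++n)
            for (int w = 0; w < oW; ++w)
                for (int c = 0; c < C; ++c) {
                    const int wb = w + cropLeft;                 // spatial coordinate before cropping
                    const int srcBatch = (wb % block) * N + n;   // which block copy the value came from
                    const int srcW     = wb / block;
                    out[(n * oW + w) * C + c] = in[(srcBatch * W + srcW) * C + c];
                }
        return out;
    }

    int main() {
        // N = 1, W = 2, C = 1, block = 2: batches {1,2} and {3,4} interleave to 1 3 2 4
        std::vector<float> in = {1, 2, 3, 4};          // shape [2, 2, 1]
        auto out = batchToSpace1D(in, 1, 2, 1, 2, 0, 0);
        for (float v : out) std::printf("%g ", v);     // prints: 1 3 2 4
        std::printf("\n");
    }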
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 //
-// @author raver119@gmail.com, created on 19.01.18.
 // @author Yurii Shyrma (iuriish@yahoo.com)
+// @author raver119@gmail.com
 //

 #include <op_boilerplate.h>

@@ -39,12 +39,11 @@ CUSTOM_OP_IMPL(space_to_batch, 2, 1, false, 0, 1) {
     const uint blockSize = INT_ARG(0);
     REQUIRE_TRUE(blockSize >= 2, 0, "SpaceToBatch: integer parameter block_size must be >= 2, but got %i instead", blockSize);

-    const int rank = input->rankOf();
-    REQUIRE_TRUE(rank == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", rank);
+    REQUIRE_TRUE(input->rankOf() == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", input->rankOf());
+    REQUIRE_TRUE(output->rankOf() == 4, 0, "SpaceToBatch: rank of output array must be equal 4, but got %i instead", output->rankOf());

-    const std::string expectedpaddingShape = "[2, 2]";
-    const std::string actualpaddingShape = ShapeUtils::shapeAsString(padding);
-    REQUIRE_TRUE(actualpaddingShape == expectedpaddingShape, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", actualpaddingShape.c_str());
+    if(padding->sizeAt(0) != 2 || padding->sizeAt(1) != 2)
+        REQUIRE_TRUE(false, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(padding).c_str());

     const uint padBottom = padding->e<uint>(0,0);
     const uint padTop    = padding->e<uint>(0,1);

@@ -78,9 +77,8 @@ DECLARE_SHAPE_FN(space_to_batch) {
     const int rank = inputShapeInfo[0];
     REQUIRE_TRUE(rank == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", rank);

-    const std::string expectedpaddingShape = "[2, 2]";
-    const std::string actualpaddingShape = ShapeUtils::shapeAsString(paddingShapeInfo);
-    REQUIRE_TRUE(actualpaddingShape == expectedpaddingShape, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", actualpaddingShape.c_str());
+    if(paddingShapeInfo[1] != 2 || paddingShapeInfo[2] != 2)
+        REQUIRE_TRUE(false, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(paddingShapeInfo).c_str());

     const uint padBottom = INPUT_VARIABLE(1)->e<Nd4jLong>(0,0);
     const uint padTop    = INPUT_VARIABLE(1)->e<Nd4jLong>(0,1);
@@ -0,0 +1,108 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <op_boilerplate.h>
+#if NOT_EXCLUDED(OP_space_to_batch_nd)
+
+#include <ops/declarable/headers/parity_ops.h>
+#include <ops/declarable/helpers/s_t_b.h>
+
+namespace nd4j {
+namespace ops {
+
+CUSTOM_OP_IMPL(space_to_batch_nd, 3, 1, false, 0, 0) {
+
+    // 4D example, numOfSpatialDims = 2 - two spatial dimensions
+    // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC]
+
+    auto input      = INPUT_VARIABLE(0);
+    auto blockShape = INPUT_VARIABLE(1);
+    auto padding    = INPUT_VARIABLE(2);
+
+    auto output = OUTPUT_VARIABLE(0);
+
+    REQUIRE_TRUE(blockShape->rankOf() == 1, 0, "SpaceToBatchND: rank of blockShape array must be equal to one, but got %i instead !", blockShape->rankOf());
+
+    const uint numOfSpatialDims = blockShape->sizeAt(0);
+
+    REQUIRE_TRUE(input->rankOf() == output->rankOf(), 0, "SpaceToBatchND: rank of input and output array must be the same, but got %i and %i correspondingly !", input->rankOf(), output->rankOf());
+
+    // FIXME - should we use this time-consuming validation ?
+    for (uint i = 0; i < numOfSpatialDims; ++i) {
+        const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
+        REQUIRE_TRUE(blockSize >= 2, 0, "SpaceToBatchND: all elements of blockShape array must be >= 2, but got value of %i for element number %i !", blockSize, i);
+    }
+
+    if(padding->sizeAt(0) != numOfSpatialDims || padding->sizeAt(1) != 2) {
+        const std::string expectedpaddingShape = "[" + std::to_string(numOfSpatialDims) + ", 2]";   // [numOfSpatialDims, 2]
+        REQUIRE_TRUE(false, 0, "SpaceToBatchND: operation expects padding shape to be %s, but got %s instead", expectedpaddingShape.c_str(), ShapeUtils::shapeAsString(padding).c_str());
+    }
+
+    // FIXME - should we use this time-consuming validation ?
+    for (uint i = 0; i < numOfSpatialDims; ++i) {
+        const uint padLeft       = padding->e<uint>(i,0);
+        const uint padRight      = padding->e<uint>(i,1);
+        const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
+        REQUIRE_TRUE((input->sizeAt(i + 1) + padLeft + padRight) % blockSize == 0, 0, "SpaceToBatchND: after padding, spatial dimensions of input array must be divisible by blockSize !");
+    }
+
+    helpers::spaceToBatchND(block.launchContext(), *input, *blockShape, *padding, *output);
+
+    return Status::OK();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+DECLARE_TYPES(space_to_batch_nd) {
+    getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY)
+                     ->setAllowedInputTypes(1, {ALL_INTS})
+                     ->setAllowedInputTypes(2, {ALL_INTS})
+                     ->setSameMode(true);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+DECLARE_SHAPE_FN(space_to_batch_nd) {

+    auto inputShapeInfo   = inputShape->at(0);
+    auto blockShapeInfo   = inputShape->at(1);
+    auto paddingShapeInfo = inputShape->at(2);
+
+    REQUIRE_TRUE(blockShapeInfo[0] == 1, 0, "SpaceToBatchND: rank of blockShape array must be equal to one, but got %i instead !", blockShapeInfo[0]);
+
+    const uint numOfSpatialDims = blockShapeInfo[1];
+
+    if(paddingShapeInfo[1] != numOfSpatialDims || paddingShapeInfo[2] != 2) {
+        const std::string expectedpaddingShape = "[" + std::to_string(numOfSpatialDims) + ", 2]";   // [numOfSpatialDims, 2]
+        REQUIRE_TRUE(false, 0, "SpaceToBatchND: operation expects padding shape to be %s, but got %s instead", expectedpaddingShape.c_str(), ShapeUtils::shapeAsString(paddingShapeInfo).c_str());
+    }
+
+    std::vector<Nd4jLong> outShape(inputShapeInfo + 1, inputShapeInfo + 1 + inputShapeInfo[0]);
+
+    outShape[0] *= INPUT_VARIABLE(1)->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
+
+    for (uint i = 0; i < numOfSpatialDims; ++i)
+        outShape[i + 1] = (outShape[i + 1] + INPUT_VARIABLE(2)->e<uint>(i,0) + INPUT_VARIABLE(2)->e<uint>(i,1)) / INPUT_VARIABLE(1)->e<Nd4jLong>(i);
+
+    return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inputShapeInfo), 'c', outShape));
+}
+
+}
+}
+
+#endif
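The shape function above multiplies the batch dimension by prod(blockShape) and divides each padded spatial dimension by its block size. A small self-contained check of that arithmetic, using the values from test space_to_batch_nd_2 further below (plain longs stand in for the shape-info buffers; s2bOutShape is an illustrative name, not a library function):

    #include <cassert>
    #include <utility>
    #include <vector>

    static std::vector<long> s2bOutShape(std::vector<long> shape,
                                         const std::vector<long>& block,
                                         const std::vector<std::pair<long,long>>& pad) {
        long prod = 1;
        for (long b : block) prod *= b;                 // product of blockShape elements
        shape[0] *= prod;                               // batch dim grows by that product
        for (size_t i = 0; i < block.size(); ++i)       // each spatial dim: pad, then divide
            shape[i + 1] = (shape[i + 1] + pad[i].first + pad[i].second) / block[i];
        return shape;
    }

    int main() {
        // x {2, 2,4,3, 1}, blockShape {2,2,3}, paddings {0,0, 0,2, 2,1} -> {24, 1,3,2, 1}
        auto out = s2bOutShape({2, 2, 4, 3, 1}, {2, 2, 3}, {{0,0}, {0,2}, {2,1}});
        assert((out == std::vector<long>{24, 1, 3, 2, 1}));
    }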
@@ -617,6 +617,10 @@ namespace nd4j {
         DECLARE_CUSTOM_OP(space_to_batch, 2, 1, false, 0, 1);
         #endif

+        #if NOT_EXCLUDED(OP_space_to_batch_nd)
+        DECLARE_CUSTOM_OP(space_to_batch_nd, 3, 1, false, 0, 0);
+        #endif
+
         /**
          *
          *

@@ -624,6 +628,9 @@ namespace nd4j {
         #if NOT_EXCLUDED(OP_batch_to_space)
         DECLARE_CUSTOM_OP(batch_to_space, 2, 1, false, 0, 1);
         #endif
+        #if NOT_EXCLUDED(OP_batch_to_space_nd)
+        DECLARE_CUSTOM_OP(batch_to_space_nd, 3, 1, false, 0, 0);
+        #endif

         /**
          * top_k operation returns a vector of k top values for
@@ -15,8 +15,8 @@
 ******************************************************************************/

 //
-// @author raver119@gmail.com, created on 19.01.18.
 // @author Yurii Shyrma (iuriish@yahoo.com)
+// @author raver119@gmail.com
 //

 #include <ops/declarable/helpers/s_t_b.h>
@@ -90,13 +90,107 @@ void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& o
     }
 }

+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims) {
+
+    // input  [bS, H * blockShape[0], W * blockShape[1], iC]
+    // output [bS, H * blockShape[0] - cropBottom - cropTop, W * blockShape[1] - cropLeft - cropRight, iC]
+
+    // if (cropTop = cropBottom = cropRight = cropLeft = 0) shapes are the same
+    // else:
+    // oH -> [cropBottom, iH - cropTop]
+    // oW -> [cropLeft, iW - cropRight]
+    // xLen >= zLen
+
+    const T* x = input.bufferAsT<T>();
+          T* z = output.bufferAsT<T>();
+
+    const int rank = input.rankOf();
+    const Nd4jLong zLen = output.lengthOf();
+
+    std::vector<Nd4jLong> coords(rank);
+
+    // loop through output array
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords))
+    for (Nd4jLong i = 0; i < zLen; ++i) {
+
+        shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data());
+
+        const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank);
+
+        // evaluate spatial coordinates for x
+        for(uint j = 1; j <= numOfSpatialDims; ++j)
+            coords[j] += crop.e<uint>(j - 1, 0);        // add crop left
+
+        z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)];
+    }
+}
+
+BUILD_SINGLE_TEMPLATE(template void batchToSpaceND_, (const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES);
+
+//////////////////////////////////////////////////////////////////////////
+void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) {
+
+    // 4D example, numOfSpatialDims = 2 - two spatial dimensions
+    // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC]
+
+    const uint rank = input.rankOf();
+    const uint numOfSpatialDims = blockShape.sizeAt(0);
+
+    //*** construct reshaping std::vector for first reshape of input array ***//
+
+    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);
+
+    int i;
+    for(i = 0; i < numOfSpatialDims; ++i)
+        temp[i] = blockShape.e<Nd4jLong>(i);
+    temp[i++] = output.sizeAt(0);
+    for(int j = 1; j < rank; ++i, ++j)
+        temp[i] = input.sizeAt(j);
+
+    NDArray inputRearranged0 = input.reshape(input.ordering(), temp);
+
+    //*** construct permuting std::vector for permutation of input array ***//
+
+    temp[0] = numOfSpatialDims;
+
+    for(i = 1; i <= numOfSpatialDims; ++i) {
+        temp[2*i - 1] = numOfSpatialDims + i;
+        temp[2*i]     = i - 1;
+    }
+    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
+        temp[i] = i;
+
+    inputRearranged0.permutei(temp);
+
+    if(input.lengthOf() == output.lengthOf()) {
+        output.assign(inputRearranged0);
+    }
+    else {
+        //*** construct reshaping std::vector for second reshape of input array ***//
+
+        temp.resize(rank);
+
+        temp[0] = output.sizeAt(0);
+
+        for(i = 1; i < rank; ++i)
+            temp[i] = (i <= numOfSpatialDims) ? input.sizeAt(i) * blockShape.e<Nd4jLong>(i - 1) : input.sizeAt(i);
+
+        NDArray inputRearranged1 = inputRearranged0.reshape(input.ordering(), temp);
+
+        BUILD_SINGLE_SELECTOR(input.dataType(), batchToSpaceND_, (inputRearranged1, crop, output, numOfSpatialDims), LIBND4J_TYPES);
+    }
+}

 //////////////////////////////////////////////////////////////////////////
 template <typename T>
 static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight) {

     // input  [bS, H * blockSize - padBottom - padTop, W * blockSize - padLeft - padRight, iC]
-    // output [bs, H * blockSize, W * blockSize, iC]
+    // output [bS, H * blockSize, W * blockSize, iC]

     // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same
     // else:
@@ -145,26 +239,153 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o

     // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC]

-    NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), input.sizeAt(3)});
+    NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), output.sizeAt(3)});
     outputRearranged0.permutei({2, 3,0, 4,1, 5});

     if(input.lengthOf() == output.lengthOf()) {
         outputRearranged0.assign(input);
     }
     else {
-        NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, input.sizeAt(3)});
+        NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, output.sizeAt(3)});
         BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatch_, (input, outputRearranged1, padBottom, padTop, padLeft, padRight), LIBND4J_TYPES);

         if(output.getBuffer() != outputRearranged1.getBuffer())
             outputRearranged0.assign(outputRearranged1);
     }
 }

+//////////////////////////////////////////////////////////////////////////
+template <typename T>
+static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims) {
+
+    // 4D example
+    // input  [bS, H * blockShape[0] - padBottom - padTop, W * blockShape[1] - padLeft - padRight, iC]
+    // output [bS, H * blockShape[0], W * blockShape[1], iC]
+
+    // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same
+    // else:
+    // iH -> [padBottom, oH - padTop]
+    // iW -> [padLeft, oW - padRight]
+    // zLen > xLen
+
+    const T* x = input.bufferAsT<T>();
+          T* z = output.bufferAsT<T>();
+
+    const int rank = input.rankOf();
+    const Nd4jLong zLen = output.lengthOf();
+
+    std::vector<Nd4jLong> coords(rank);
+
+    // loop through output array
+    PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords))
+    for (Nd4jLong i = 0; i < zLen; ++i) {
+
+        shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data());
+
+        const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank);
+
+        bool within = true;
+
+        for(uint j = 1; j <= numOfSpatialDims; ++j) {
+
+            const auto padLeft  = padding.e<uint>(j - 1, 0);
+            const auto padRight = padding.e<uint>(j - 1, 1);
+
+            within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight);
+
+            if(!within)
+                break;
+
+            coords[j] -= padLeft;       // get coordinates for x
+        }
+
+        if(within)
+            z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)];
+        else
+            z[zOffset] = 0.f;
+    }
+}
+
+BUILD_SINGLE_TEMPLATE(template void spaceToBatchND_, (const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES);
+
+//////////////////////////////////////////////////////////////////////////
+void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) {
+
+    // 4D example with two spatial dimensions
+    // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC]
+
+    const uint rank = input.rankOf();
+    const uint numOfSpatialDims = blockShape.sizeAt(0);
+
+    //*** construct reshaping std::vector for first reshape of output array ***//
+    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);
+
+    int i;
+    for(i = 0; i < numOfSpatialDims; ++i)
+        temp[i] = blockShape.e<Nd4jLong>(i);
+    temp[i++] = input.sizeAt(0);
+    for(int j = 1; j < rank; ++i, ++j)
+        temp[i] = output.sizeAt(j);
+
+    NDArray outputRearranged0 = output.reshape(output.ordering(), temp);
+
+    //*** construct permuting std::vector for permutation of output array ***//
+
+    temp[0] = numOfSpatialDims;
+
+    for(i = 1; i <= numOfSpatialDims; ++i) {
+        temp[2*i - 1] = numOfSpatialDims + i;
+        temp[2*i]     = i - 1;
+    }
+    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
+        temp[i] = i;
+
+    outputRearranged0.permutei(temp);
+
+    // ****** //
+
+    if(input.lengthOf() == output.lengthOf()) {
+        outputRearranged0.assign(input);
+    }
+    else {
+        //*** construct reshaping std::vector for second reshape of output array ***//
+        temp.resize(rank);
+
+        temp[0] = input.sizeAt(0);
+
+        for(i = 1; i < rank; ++i)
+            temp[i] = (i <= numOfSpatialDims) ? output.sizeAt(i) * blockShape.e<Nd4jLong>(i - 1) : output.sizeAt(i);
+
+        NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp);
+
+        BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatchND_, (input, padding, outputRearranged1, numOfSpatialDims), LIBND4J_TYPES);
+
+        if(output.getBuffer() != outputRearranged1.getBuffer())
+            outputRearranged0.assign(outputRearranged1);
+    }
+}

 /*
     template <int N, bool B2S>
     struct SpaceToBatchHelper {
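The generic permutation vector built in spaceToBatchND/batchToSpaceND can be sanity-checked against the hard-coded {2, 3,0, 4,1, 5} that the fixed 4D spaceToBatch path uses. A standalone sketch of that construction for numOfSpatialDims = 2, rank = 4:

    #include <cstdio>
    #include <vector>

    int main() {
        const int numOfSpatialDims = 2, rank = 4;
        // reshaped array is [block0, block1, bS, spatial..., rest]; the permutation
        // interleaves each spatial dim with its block dim behind the batch dim
        std::vector<int> temp(numOfSpatialDims + rank);
        temp[0] = numOfSpatialDims;                     // batch dim comes first
        for (int i = 1; i <= numOfSpatialDims; ++i) {
            temp[2*i - 1] = numOfSpatialDims + i;       // spatial dim i
            temp[2*i]     = i - 1;                      // its block dim
        }
        for (int i = 2 * numOfSpatialDims + 1; i < (int)temp.size(); ++i)
            temp[i] = i;                                // remaining dims keep their place
        for (int v : temp) std::printf("%d ", v);       // prints: 2 3 0 4 1 5
        std::printf("\n");
    }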
@@ -135,10 +135,10 @@ __global__ static void col2imCuda2(const void *columns, void *image, const Nd4jL

     for (int i = (blockDim.x * blockIdx.x) + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
         T val = 0;

         int w_im = i % iW + pW;
         int h_im = (i / iW) % iH + pH;
         int c_im = i / (iW * iH);

         int b = c_im / iC;
         int c = c_im % iC;
@@ -39,8 +39,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI
     T* col = reinterpret_cast<T*>(columns);

     __shared__ int colRank, volRank;
-    __shared__ Nd4jLong colLen, iD, iH, iW;
-    __shared__ Nd4jLong *sharedMem;
+    __shared__ Nd4jLong colLen, iD, iH, iW, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -38,13 +38,13 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
     // else:
     // oH -> [cropBottom, iH - cropTop]
     // oW -> [cropLeft, iH - cropRight]
-    // xLen > zLen
+    // xLen >= zLen

     const auto x = reinterpret_cast<const T*>(vx);
           auto z = reinterpret_cast<T*>(vz);

     __shared__ int rank;
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -52,7 +52,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn

         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
-        totalThreads = gridDim.x * blockDim.x;
     }
     __syncthreads();
@@ -116,6 +115,139 @@ void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& o
     }
 }

+///////////////////////////////////////////////////////////////////
+template<typename X, typename Y>
+__global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShapeInfo,
+                                          const void* vy, const Nd4jLong* yShapeInfo,
+                                                void* vz, const Nd4jLong* zShapeInfo,
+                                          const uint numOfSpatialDims) {
+
+    // 4D example, numOfSpatialDims = 2
+    // input  [bS, H * blockShape[0], W * blockShape[1], iC]
+    // output [bS, H * blockShape[0] - cropBottom - cropTop, W * blockShape[1] - cropLeft - cropRight, iC]
+
+    // if (cropTop = cropBottom = cropRight = cropLeft = 0) shapes are the same
+    // else:
+    // oH -> [cropBottom, iH - cropTop]
+    // oW -> [cropLeft, iW - cropRight]
+    // xLen >= zLen
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const Y*>(vy);
+          auto z = reinterpret_cast<X*>(vz);
+
+    __shared__ int rank;
+    __shared__ Nd4jLong zLen, *sharedMem;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        rank = shape::rank(zShapeInfo);
+        zLen = shape::length(zShapeInfo);
+    }
+
+    __syncthreads();
+
+    auto coords = sharedMem + threadIdx.x * rank;
+
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) {
+
+        shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords);
+
+        const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
+
+        // evaluate spatial coordinates for x
+        for(uint j = 1; j <= numOfSpatialDims; ++j) {
+            const auto yOffset = (j - 1) * yShapeInfo[3];   // yRank = 2, calculate offset manually
+            coords[j] += y[yOffset];                        // add crop left
+        }
+
+        const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank);
+
+        z[zOffset] = x[xOffset];
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+template<typename X,typename Y>
+static void batchToSpaceNDCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims) {
+
+    batchToSpaceNDCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, numOfSpatialDims);
+}
+BUILD_DOUBLE_TEMPLATE(template void batchToSpaceNDCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES);
+
+//////////////////////////////////////////////////////////////////////////
+void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) {
+
+    // 4D example, numOfSpatialDims = 2 - two spatial dimensions
+    // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC]
+
+    const uint rank = input.rankOf();
+    const uint numOfSpatialDims = blockShape.sizeAt(0);
+
+    //*** construct reshaping std::vector for first reshape of input array ***//
+
+    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);
+
+    int i;
+    for(i = 0; i < numOfSpatialDims; ++i)
+        temp[i] = blockShape.e<Nd4jLong>(i);
+    temp[i++] = output.sizeAt(0);
+    for(int j = 1; j < rank; ++i, ++j)
+        temp[i] = input.sizeAt(j);
+
+    NDArray inputRearranged0 = input.reshape(input.ordering(), temp);
+
+    //*** construct permuting std::vector for permutation of input array ***//
+
+    temp[0] = numOfSpatialDims;
+
+    for(i = 1; i <= numOfSpatialDims; ++i) {
+        temp[2*i - 1] = numOfSpatialDims + i;
+        temp[2*i]     = i - 1;
+    }
+    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
+        temp[i] = i;
+
+    inputRearranged0.permutei(temp);
+
+    if(input.lengthOf() == output.lengthOf()) {
+        output.assign(inputRearranged0);
+    }
+    else {
+        //*** construct reshaping std::vector for second reshape of input array ***//
+
+        temp.resize(rank);
+
+        temp[0] = output.sizeAt(0);
+
+        for(i = 1; i < rank; ++i)
+            temp[i] = (i <= numOfSpatialDims) ? input.sizeAt(i) * blockShape.e<Nd4jLong>(i - 1) : input.sizeAt(i);
+
+        NDArray inputRearranged1 = inputRearranged0.reshape(input.ordering(), temp);
+
+        const int threadsPerBlock = MAX_NUM_THREADS / 4;
+        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+
+        PointersManager manager(context, "batchToSpaceND");
+
+        NDArray::prepareSpecialUse({&output}, {&inputRearranged1, &crop});
+        BUILD_DOUBLE_SELECTOR(input.dataType(), crop.dataType(), batchToSpaceNDCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), inputRearranged1.getSpecialBuffer(), inputRearranged1.getSpecialShapeInfo(), crop.getSpecialBuffer(), crop.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES);
+        NDArray::registerSpecialUse({&output}, {&inputRearranged1, &crop});
+
+        manager.synchronize();
+    }
+}

 ///////////////////////////////////////////////////////////////////
 template<typename T>
 __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight) {
@@ -133,7 +265,7 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
     auto z = reinterpret_cast<T*>(vz);

     __shared__ int rank;
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];

@@ -141,7 +273,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn

         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
-        totalThreads = gridDim.x * blockDim.x;
     }
     __syncthreads();
@@ -210,6 +341,153 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o
     }
 }

+///////////////////////////////////////////////////////////////////
+template<typename X, typename Y>
+__global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShapeInfo,
+                                          const void* vy, const Nd4jLong* yShapeInfo,
+                                                void* vz, const Nd4jLong* zShapeInfo,
+                                          const uint numOfSpatialDims) {
+
+    // x - input, y - padding, z - output
+
+    // 4D example
+    // input  [bS, H * blockShape[0] - padBottom - padTop, W * blockShape[1] - padLeft - padRight, iC]
+    // output [bS, H * blockShape[0], W * blockShape[1], iC]
+
+    // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same
+    // else:
+    // iH -> [padBottom, oH - padTop]
+    // iW -> [padLeft, oW - padRight]
+    // zLen > xLen
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const Y*>(vy);
+          auto z = reinterpret_cast<X*>(vz);
+
+    __shared__ int rank;        // xRank = zRank, yRank = 2;
+    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        rank = shape::rank(zShapeInfo);
+        zLen = shape::length(zShapeInfo);
+        totalThreads = gridDim.x * blockDim.x;
+    }
+
+    __syncthreads();
+
+    auto coords = sharedMem + threadIdx.x * rank;
+
+    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < zLen; i += totalThreads) {
+
+        shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords);
+
+        const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank);
+
+        bool within = true;
+
+        for(uint j = 1; j <= numOfSpatialDims; ++j) {
+
+            // yRank = 2, calculate offset manually
+            const auto yOffset  = (j - 1) * yShapeInfo[3];
+            const auto padLeft  = y[yOffset];
+            const auto padRight = y[yOffset + yShapeInfo[4]];
+
+            within &= (coords[j] >= padLeft && coords[j] < shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo))[j] - padRight);
+
+            if(!within)
+                break;
+
+            coords[j] -= padLeft;       // get coordinates for x
+        }
+
+        if(within)
+            z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)];
+        else
+            z[zOffset] = 0.f;
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+template<typename X, typename Y>
+static void spaceToBatchNDCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims) {
+
+    spaceToBatchNDCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, numOfSpatialDims);
+}
+BUILD_DOUBLE_TEMPLATE(template void spaceToBatchNDCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES);
+
+//////////////////////////////////////////////////////////////////////////
+void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) {
+
+    // 4D example with two spatial dimensions
+    // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC]
+
+    const uint rank = input.rankOf();
+    const uint numOfSpatialDims = blockShape.sizeAt(0);
+
+    //*** construct reshaping std::vector for first reshape of output array ***//
+    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);
+
+    int i;
+    for(i = 0; i < numOfSpatialDims; ++i)
+        temp[i] = blockShape.e<Nd4jLong>(i);
+    temp[i++] = input.sizeAt(0);
+    for(int j = 1; j < rank; ++i, ++j)
+        temp[i] = output.sizeAt(j);
+
+    NDArray outputRearranged0 = output.reshape(output.ordering(), temp);
+
+    //*** construct permuting std::vector for permutation of output array ***//
+
+    temp[0] = numOfSpatialDims;
+
+    for(i = 1; i <= numOfSpatialDims; ++i) {
+        temp[2*i - 1] = numOfSpatialDims + i;
+        temp[2*i]     = i - 1;
+    }
+    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
+        temp[i] = i;
+
+    outputRearranged0.permutei(temp);
+
+    // ****** //
+
+    if(input.lengthOf() == output.lengthOf()) {
+        outputRearranged0.assign(input);
+    }
+    else {
+        //*** construct reshaping std::vector for second reshape of output array ***//
+        temp.resize(rank);
+
+        temp[0] = input.sizeAt(0);
+
+        for(i = 1; i < rank; ++i)
+            temp[i] = (i <= numOfSpatialDims) ? output.sizeAt(i) * blockShape.e<Nd4jLong>(i - 1) : output.sizeAt(i);
+
+        NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp);
+
+        const int threadsPerBlock = MAX_NUM_THREADS / 4;
+        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+
+        PointersManager manager(context, "spaceToBatchND");
+
+        NDArray::prepareSpecialUse({&outputRearranged1}, {&input, &padding});
+        BUILD_DOUBLE_SELECTOR(input.dataType(), padding.dataType(), spaceToBatchNDCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), padding.getSpecialBuffer(), padding.getSpecialShapeInfo(), outputRearranged1.specialBuffer(), outputRearranged1.specialShapeInfo(), numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES);
+        NDArray::registerSpecialUse({&outputRearranged1}, {&input, &padding});
+
+        manager.synchronize();
+
+        if(output.getSpecialBuffer() != outputRearranged1.getSpecialBuffer())
+            outputRearranged0.assign(outputRearranged1);
+    }
+}

 /*
     template <int N, bool B2S>
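The CUDA launchers above give each thread rank Nd4jLong slots of dynamic shared memory for its coordinate scratch, which is where the sharedMem sizing comes from. A quick standalone check of that arithmetic, assuming MAX_NUM_THREADS = 1024 as a stand-in for the library constant (its actual value is not shown in this diff):

    #include <cstdio>

    int main() {
        const int MAX_NUM_THREADS = 1024;                // assumed value, not from this diff
        const long outputLen = 24 * 1 * 3 * 2 * 1;       // output of test space_to_batch_nd_2
        const int rank = 5;

        const int threadsPerBlock = MAX_NUM_THREADS / 4;                                 // 256
        const int blocksPerGrid = (outputLen + threadsPerBlock - 1) / threadsPerBlock;   // ceil-div -> 1
        const int sharedMem = threadsPerBlock * (int)sizeof(long long) * rank + 128;     // per-thread coords + slack

        std::printf("%d block(s) x %d threads, %d bytes shared\n",
                    blocksPerGrid, threadsPerBlock, sharedMem);   // 1 x 256, 10368 bytes
    }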
@@ -31,6 +31,10 @@ namespace helpers {

 void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize);

+void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output);
+
+void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output);
+
 /*
     // this method MUST be platform-specific
@@ -689,3 +689,161 @@ TEST_F(DeclarableOpsTests13, cyclic_rshift_bits_1) {
     delete result;
 }

+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, space_to_batch_nd_1) {
+
+    NDArray x('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32);
+    NDArray blockShape('c', {3}, {2, 2, 2}, nd4j::DataType::INT32);     // three spatial dimensions
+    NDArray paddings('c', {3, 2}, {0, 0, 0, 0, 0, 0}, nd4j::DataType::INT32);
+
+    NDArray exp('c', {8, 1, 1, 1, 3}, nd4j::DataType::FLOAT32);
+
+    x.linspace(1);
+    exp.linspace(1);
+
+    nd4j::ops::space_to_batch_nd op;
+    auto result = op.execute({&x, &blockShape, &paddings}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_TRUE(exp.isSameShape(z));
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, space_to_batch_nd_2) {
+
+    NDArray x('c', {2, 2,4,3, 1}, nd4j::DataType::FLOAT32);
+    NDArray blockShape('c', {3}, {2, 2, 3}, nd4j::DataType::INT32);     // three spatial dimensions
+    NDArray paddings('c', {3, 2}, {0,0, 0,2, 2,1}, nd4j::DataType::INT32);
+
+    NDArray exp('c', {24, 1,3,2, 1}, { 0, 2, 0, 8, 0, 0, 0, 26, 0, 32, 0, 0, 0, 3, 0, 9, 0, 0, 0, 27, 0, 33, 0, 0, 1,
+                                       0, 7, 0, 0, 0, 25, 0, 31, 0, 0, 0, 0, 5, 0, 11, 0, 0, 0, 29, 0, 35, 0, 0, 0, 6,
+                                       0, 12, 0, 0, 0, 30, 0, 36, 0, 0, 4, 0, 10, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 14,
+                                       0, 20, 0, 0, 0, 38, 0, 44, 0, 0, 0, 15, 0, 21, 0, 0, 0, 39, 0, 45, 0, 0, 13, 0,
+                                       19, 0, 0, 0, 37, 0, 43, 0, 0, 0, 0, 17, 0, 23, 0, 0, 0, 41, 0, 47, 0, 0, 0, 18,
+                                       0, 24, 0, 0, 0, 42, 0, 48, 0, 0, 16, 0, 22, 0, 0, 0, 40, 0, 46, 0, 0, 0}, nd4j::DataType::FLOAT32);
+    x.linspace(1);
+
+    nd4j::ops::space_to_batch_nd op;
+    auto result = op.execute({&x, &blockShape, &paddings}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+    // z->printBuffer();
+
+    ASSERT_TRUE(exp.isSameShape(z));
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, space_to_batch_nd_3) {
+
+    NDArray x('c', {2, 2,4,3, 1}, nd4j::DataType::FLOAT32);
+    NDArray blockShape('c', {3}, {2, 2, 3}, nd4j::DataType::INT32);     // three spatial dimensions
+    NDArray paddings('c', {3, 2}, {1,1, 0,2, 2,1}, nd4j::DataType::INT32);
+
+    NDArray exp('c', {24, 2,3,2, 1}, { 0, 0, 0, 0, 0, 0, 0, 14, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15,
+                                       0, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 45, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 19, 0, 0, 0, 0, 0, 0, 0,
+                                       0, 0, 37, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 0, 47, 0, 0,
+                                       0, 0, 0, 0, 0, 0, 0, 18, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0, 48, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0,
+                                       22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 0, 46, 0, 0, 0, 0, 2, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0, 32,
+                                       0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+                                       0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 11, 0, 0, 0, 0, 0, 0,
+                                       0, 0, 0, 29, 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 0, 36, 0, 0,
+                                       0, 0, 0, 0, 0, 0, 4, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0}, nd4j::DataType::FLOAT32);
+    x.linspace(1);
+
+    nd4j::ops::space_to_batch_nd op;
+    auto result = op.execute({&x, &blockShape, &paddings}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+    // z->printBuffer();
+
+    ASSERT_TRUE(exp.isSameShape(z));
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batch_to_space_nd_1) {
+
+    NDArray x('c', {8, 1, 1, 1, 3}, nd4j::DataType::FLOAT32);
+
+    NDArray blockShape('c', {3}, {2, 2, 2}, nd4j::DataType::INT32);     // three spatial dimensions
+    NDArray crop('c', {3, 2}, {0, 0, 0, 0, 0, 0}, nd4j::DataType::INT32);
+
+    NDArray exp('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32);
+
+    x.linspace(1);
+    exp.linspace(1);
+
+    nd4j::ops::batch_to_space_nd op;
+    auto result = op.execute({&x, &blockShape, &crop}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_TRUE(exp.isSameShape(z));
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batch_to_space_nd_2) {
+
+    NDArray x('c', {24, 1,3,2, 1}, nd4j::DataType::FLOAT32);
+    NDArray blockShape('c', {3}, {2, 2, 3}, nd4j::DataType::INT32);     // three spatial dimensions
+    NDArray crop('c', {3, 2}, {0,0, 0,2, 2,1}, nd4j::DataType::INT32);
+
+    NDArray exp('c', {2, 2,4,3, 1}, {25, 2, 14, 61, 38, 50, 27, 4, 16, 63, 40, 52, 97, 74, 86, 133, 110, 122, 99, 76, 88, 135, 112, 124,
+                                     31, 8, 20, 67, 44, 56, 33, 10, 22, 69, 46, 58, 103, 80, 92, 139, 116, 128, 105, 82, 94, 141, 118, 130}, nd4j::DataType::FLOAT32);
+    x.linspace(1);
+
+    nd4j::ops::batch_to_space_nd op;
+    auto result = op.execute({&x, &blockShape, &crop}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+    // z->printBuffer();
+
+    ASSERT_TRUE(exp.isSameShape(z));
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests13, batch_to_space_nd_3) {
+
+    NDArray x('c', {24, 2,3,2, 1}, nd4j::DataType::FLOAT32);
+    NDArray blockShape('c', {3}, {2, 2, 3}, nd4j::DataType::INT32);     // three spatial dimensions
+    NDArray crop('c', {3, 2}, {1,1, 0,2, 2,1}, nd4j::DataType::INT32);
+
+    NDArray exp('c', {2, 2,4,3, 1}, {193, 146, 170, 265, 218, 242, 195, 148, 172, 267, 220, 244, 55, 8, 32, 127, 80, 104, 57, 10, 34, 129, 82,
+                                     106, 205, 158, 182, 277, 230, 254, 207, 160, 184, 279, 232, 256, 67, 20, 44, 139, 92, 116, 69, 22, 46, 141, 94, 118}, nd4j::DataType::FLOAT32);
+    x.linspace(1);
+
+    nd4j::ops::batch_to_space_nd op;
+    auto result = op.execute({&x, &blockShape, &crop}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+    // z->printBuffer();
+
+    ASSERT_TRUE(exp.isSameShape(z));
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}