From eea3062ccfbf19841f4c3fea3f4182dfbe5b13d8 Mon Sep 17 00:00:00 2001 From: raver119 Date: Wed, 21 Aug 2019 21:11:46 +0300 Subject: [PATCH] [WIP] stb/bts nd (#144) * - start working on space_to_batch_nd Signed-off-by: Yurii * - provide cpu helper for space_to_batch_nd op Signed-off-by: Yurii * few typos fixed Signed-off-by: raver119 * - add tests for space_to_batch and correct bugs Signed-off-by: Yurii * - write cuda kernel for space_to_batch op Signed-off-by: Yurii * - add order argument to shape::index2coords method in convolution cuda ops Signed-off-by: Yurii * - restore some previous code Signed-off-by: Yurii * old col2im kernel activated Signed-off-by: raver119 * - change coords calculation in col2im kernel Signed-off-by: Yurii * - restore old col2im kernel Signed-off-by: Yurii * - add custom op for batch_to_space Signed-off-by: Yurii * - provide cpu version for batch_to_space_nd op Signed-off-by: Yurii * - provide cuda kernel for batch_to_space_nd op Signed-off-by: Yurii --- libnd4j/blas/NDArray.h | 1 + .../generic/parity_ops/batch_to_space.cpp | 11 +- .../generic/parity_ops/batch_to_space_nd.cpp | 131 ++++++++ .../generic/parity_ops/space_to_batch.cpp | 16 +- .../generic/parity_ops/space_to_batch_nd.cpp | 108 +++++++ .../ops/declarable/headers/parity_ops.h | 7 + .../ops/declarable/helpers/cpu/s_t_b.cpp | 231 +++++++++++++- .../ops/declarable/helpers/cuda/col2im.cu | 8 +- .../declarable/helpers/cuda/convolutions.cu | 21 +- .../ops/declarable/helpers/cuda/s_t_b.cu | 288 +++++++++++++++++- .../include/ops/declarable/helpers/s_t_b.h | 4 + .../layers_tests/DeclarableOpsTests13.cpp | 158 ++++++++++ 12 files changed, 943 insertions(+), 41 deletions(-) create mode 100644 libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space_nd.cpp create mode 100644 libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch_nd.cpp diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index 5d8e2d47a..3237e5033 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -599,6 +599,7 @@ namespace nd4j { /** * apply scalar operation to array * extraParams - extra parameters for operation + * returns scalar array */ NDArray reduceNumber(nd4j::reduce::FloatOps ops, void *extraParams = nullptr) const; NDArray reduceNumber(nd4j::reduce::SameOps ops, void *extraParams = nullptr) const; diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space.cpp index d5ad76110..fe2575f1f 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space.cpp @@ -30,7 +30,6 @@ limitations under the License. ==============================================================================*/ // -// @author raver119@gmail.com, created on 19.01.18. 
// @author Yurii Shyrma (iuriish@yahoo.com) // @@ -63,9 +62,8 @@ CUSTOM_OP_IMPL(batch_to_space, 2, 1, false, 0, 1) { REQUIRE_TRUE(rank == 4, 0, "BatchToSpace: rank of input array must be equal 4, but got %i instead", rank); REQUIRE_TRUE(dim0 % (blockSize * blockSize) == 0, 0, "BatchToSpace: first dimension of input array must be divisible by blockSize * blockSize (that is by %i), but got first dimension equal to %i", blockSize * blockSize, dim0); - const std::string expectedCropShape = "[2, 2]"; - const std::string actualCropShape = ShapeUtils::shapeAsString(crop); - REQUIRE_TRUE(actualCropShape == expectedCropShape, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", actualCropShape.c_str()); + if(crop->sizeAt(0) != 2 || crop->sizeAt(1) != 2) + REQUIRE_TRUE(false, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(crop).c_str()); const uint cropBottom = crop->e(0,0); const uint cropTop = crop->e(0,1); @@ -104,9 +102,8 @@ DECLARE_SHAPE_FN(batch_to_space) { REQUIRE_TRUE(rank == 4, 0, "BatchToSpace: rank of input array must be equal 4, but got %i instead", rank); REQUIRE_TRUE(dim0 % (blockSize * blockSize) == 0, 0, "BatchToSpace: first dimension of input array must be divisible by blockSize * blockSize (that is by %i), but got first dimension equal to %i", blockSize * blockSize, dim0); - const std::string expectedCropShape = "[2, 2]"; - const std::string actualCropShape = ShapeUtils::shapeAsString(cropShapeInfo); - REQUIRE_TRUE(actualCropShape == expectedCropShape, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", actualCropShape.c_str()); + if(cropShapeInfo[1] != 2 || cropShapeInfo[2] != 2) + REQUIRE_TRUE(false, 0, "BatchToSpace: operation expects crop shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(cropShapeInfo).c_str()); const uint cropBottom = INPUT_VARIABLE(1)->e(0,0); const uint cropTop = INPUT_VARIABLE(1)->e(0,1); diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space_nd.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space_nd.cpp new file mode 100644 index 000000000..bc26e39ec --- /dev/null +++ b/libnd4j/include/ops/declarable/generic/parity_ops/batch_to_space_nd.cpp @@ -0,0 +1,131 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <op_boilerplate.h>
+#if NOT_EXCLUDED(OP_batch_to_space_nd)
+
+#include <ops/declarable/CustomOperations.h>
+#include <ops/declarable/helpers/s_t_b.h>
+
+namespace nd4j {
+namespace ops {
+
+
+CUSTOM_OP_IMPL(batch_to_space_nd, 3, 1, false, 0, 0) {
+
+ // 4D example, numOfSpatialDims = 2 - two spatial dimensions
+ // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC]
+
+ auto input = INPUT_VARIABLE(0);
+ auto blockShape = INPUT_VARIABLE(1);
+ auto crop = INPUT_VARIABLE(2);
+
+ auto output = OUTPUT_VARIABLE(0);
+
+ REQUIRE_TRUE(blockShape->rankOf() == 1, 0, "BatchToSpaceND: rank of blockShape array must be equal to one, but got %i instead !", blockShape->rankOf());
+
+ const uint numOfSpatialDims = blockShape->sizeAt(0);
+
+ const auto product = blockShape->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
+ REQUIRE_TRUE(input->sizeAt(0) % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, input->sizeAt(0));
+
+ // FIXME - should we use this time-consuming validation ?
+ for (uint i = 0; i < numOfSpatialDims; ++i) {
+ const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
+ REQUIRE_TRUE(blockSize >= 2, 0, "BatchToSpaceND: all elements of blockShape array must be >= 2, but got value of %lld for element number %i !", blockSize, i);
+ }
+
+ if(crop->sizeAt(0) != numOfSpatialDims || crop->sizeAt(1) != 2) {
+ const std::string expectedCropShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
+ REQUIRE_TRUE(false, 0, "BatchToSpaceND: operation expects crop shape to be %s, but got %s instead", expectedCropShape.c_str(), ShapeUtils::shapeAsString(crop).c_str());
+ }
+
+ // FIXME - should we use this time-consuming validation ?
+ for (uint i = 0; i < numOfSpatialDims; ++i) {
+ const auto cropLeft = crop->e<Nd4jLong>(i,0);
+ const auto cropRight = crop->e<Nd4jLong>(i,1);
+ const auto outSpatialDim = input->sizeAt(i + 1) * blockShape->e<Nd4jLong>(i) - cropLeft - cropRight;
+ REQUIRE_TRUE(outSpatialDim >= 0, 0, "BatchToSpaceND: crop left/right values are too big and cause negative output spatial dimension/dimensions !");
+ }
+
+ helpers::batchToSpaceND(block.launchContext(), *input, *blockShape, *crop, *output);
+
+ return Status::OK();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+DECLARE_TYPES(batch_to_space_nd) {
+
+ getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY)
+ ->setAllowedInputTypes(1, {ALL_INTS})
+ ->setAllowedInputTypes(2, {ALL_INTS})
+ ->setSameMode(true);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+DECLARE_SHAPE_FN(batch_to_space_nd) {
+
+ auto inputShapeInfo = inputShape->at(0);
+ auto blockShapeInfo = inputShape->at(1);
+ auto cropShapeInfo = inputShape->at(2);
+
+ REQUIRE_TRUE(blockShapeInfo[0] == 1, 0, "BatchToSpaceND: rank of blockShape array must be equal to one, but got %i instead !", blockShapeInfo[0]);
+
+ const auto product = INPUT_VARIABLE(1)->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
+ REQUIRE_TRUE(inputShapeInfo[1] % product == 0, 0, "BatchToSpaceND: first dimension of input array must be divisible by product of blockShape array elements (= %lld), but got first dimension equal to %i", product, inputShapeInfo[1]);
+
+ const auto numOfSpatialDims = blockShapeInfo[1];
+
+ if(cropShapeInfo[1] != numOfSpatialDims || cropShapeInfo[2] != 2) {
+ const std::string expectedCropShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
+ REQUIRE_TRUE(false, 0, "BatchToSpaceND: operation expects crop shape to be %s, but got %s instead", expectedCropShape.c_str(), ShapeUtils::shapeAsString(cropShapeInfo).c_str());
+ }
+
+
+ std::vector<Nd4jLong> outShape(inputShapeInfo + 1, inputShapeInfo + 1 + inputShapeInfo[0]);
+
+ outShape[0] /= product;
+
+ for (uint i = 0; i < numOfSpatialDims; ++i)
+ outShape[i + 1] = outShape[i + 1] * INPUT_VARIABLE(1)->e<Nd4jLong>(i) - INPUT_VARIABLE(2)->e<Nd4jLong>(i,0) - INPUT_VARIABLE(2)->e<Nd4jLong>(i,1);
+
+ return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inputShapeInfo), 'c', outShape));
+}
+
+
+}
+}
+
+#endif
\ No newline at end of file
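For reference, the output-shape rule that DECLARE_SHAPE_FN(batch_to_space_nd) above implements can be written as a small standalone function. This is a minimal sketch, not part of the patch itself; batchToSpaceNDShape and its std::vector-based signature are hypothetical stand-ins for the shape-info machinery above:

#include <cstdint>
#include <utility>
#include <vector>

// The batch dim shrinks by prod(blockShape); every spatial dim grows by its
// block size and then loses the left/right crops:
//   outShape[0]     = inShape[0] / prod(blockShape)
//   outShape[i + 1] = inShape[i + 1] * blockShape[i] - cropLeft[i] - cropRight[i]
std::vector<int64_t> batchToSpaceNDShape(const std::vector<int64_t>& inShape,
                                         const std::vector<int64_t>& blockShape,
                                         const std::vector<std::pair<int64_t, int64_t>>& crop) {
    int64_t product = 1;
    for (int64_t b : blockShape)
        product *= b;
    std::vector<int64_t> outShape(inShape);
    outShape[0] /= product;
    for (size_t i = 0; i < blockShape.size(); ++i)
        outShape[i + 1] = outShape[i + 1] * blockShape[i] - crop[i].first - crop[i].second;
    return outShape;
}

For example, inShape = {8, 1, 1, 1, 3} with blockShape = {2, 2, 2} and zero crops yields {1, 2, 2, 2, 3}, which matches test batch_to_space_nd_1 further down.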
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch.cpp
index 23f2c42c6..2f297c893 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch.cpp
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
//
-// @author raver119@gmail.com, created on 19.01.18.
// @author Yurii Shyrma (iuriish@yahoo.com)
+// @author raver119@gmail.com
//

#include <op_boilerplate.h>
@@ -39,12 +39,11 @@ CUSTOM_OP_IMPL(space_to_batch, 2, 1, false, 0, 1) {
 const uint blockSize = INT_ARG(0);
 REQUIRE_TRUE(blockSize >= 2, 0, "SpaceToBatch: integer parameter block_size must be >= 2, but got %i instead", blockSize);
- const int rank = input->rankOf();
- REQUIRE_TRUE(rank == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", rank);
+ REQUIRE_TRUE(input->rankOf() == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", input->rankOf());
+ REQUIRE_TRUE(output->rankOf() == 4, 0, "SpaceToBatch: rank of output array must be equal 4, but got %i instead", output->rankOf());
- const std::string expectedpaddingShape = "[2, 2]";
- const std::string actualpaddingShape = ShapeUtils::shapeAsString(padding);
- REQUIRE_TRUE(actualpaddingShape == expectedpaddingShape, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", actualpaddingShape.c_str());
+ if(padding->sizeAt(0) != 2 || padding->sizeAt(1) != 2)
+ REQUIRE_TRUE(false, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(padding).c_str());
 const uint padBottom = padding->e<uint>(0,0);
 const uint padTop = padding->e<uint>(0,1);
@@ -78,9 +77,8 @@ DECLARE_SHAPE_FN(space_to_batch) {
 const int rank = inputShapeInfo[0];
 REQUIRE_TRUE(rank == 4, 0, "SpaceToBatch: rank of input array must be equal 4, but got %i instead", rank);
- const std::string expectedpaddingShape = "[2, 2]";
- const std::string actualpaddingShape = ShapeUtils::shapeAsString(paddingShapeInfo);
- REQUIRE_TRUE(actualpaddingShape == expectedpaddingShape, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", actualpaddingShape.c_str());
+ if(paddingShapeInfo[1] != 2 || paddingShapeInfo[2] != 2)
+ REQUIRE_TRUE(false, 0, "SpaceToBatch: operation expects padding shape to be {2, 2}, but got %s instead", ShapeUtils::shapeAsString(paddingShapeInfo).c_str());
 const uint padBottom = INPUT_VARIABLE(1)->e<uint>(0,0);
 const uint padTop = INPUT_VARIABLE(1)->e<uint>(0,1);
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch_nd.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch_nd.cpp
new file mode 100644
index 000000000..085743c98
--- /dev/null
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/space_to_batch_nd.cpp
@@ -0,0 +1,108 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+#include <op_boilerplate.h>
+#if NOT_EXCLUDED(OP_space_to_batch_nd)
+
+#include <ops/declarable/CustomOperations.h>
+#include <ops/declarable/helpers/s_t_b.h>
+
+namespace nd4j {
+namespace ops {
+
+
+CUSTOM_OP_IMPL(space_to_batch_nd, 3, 1, false, 0, 0) {
+
+ // 4D example, numOfSpatialDims = 2 - two spatial dimensions
+ // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC]
+
+ auto input = INPUT_VARIABLE(0);
+ auto blockShape = INPUT_VARIABLE(1);
+ auto padding = INPUT_VARIABLE(2);
+
+ auto output = OUTPUT_VARIABLE(0);
+
+ REQUIRE_TRUE(blockShape->rankOf() == 1, 0, "SpaceToBatchND: rank of blockShape array must be equal to one, but got %i instead !", blockShape->rankOf());
+
+ const uint numOfSpatialDims = blockShape->sizeAt(0);
+
+ REQUIRE_TRUE(input->rankOf() == output->rankOf(), 0, "SpaceToBatchND: rank of input and output array must be the same, but got %i and %i correspondingly !", input->rankOf(), output->rankOf());
+
+ // FIXME - should we use this time-consuming validation ?
+ for (uint i = 0; i < numOfSpatialDims; ++i) {
+ const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
+ REQUIRE_TRUE(blockSize >= 2, 0, "SpaceToBatchND: all elements of blockShape array must be >= 2, but got value of %lld for element number %i !", blockSize, i);
+ }
+
+ if(padding->sizeAt(0) != numOfSpatialDims || padding->sizeAt(1) != 2) {
+ const std::string expectedpaddingShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
+ REQUIRE_TRUE(false, 0, "SpaceToBatchND: operation expects padding shape to be %s, but got %s instead", expectedpaddingShape.c_str(), ShapeUtils::shapeAsString(padding).c_str());
+ }
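The padding-shape check above completes the cheap validations; the divisibility check that follows and DECLARE_SHAPE_FN(space_to_batch_nd) later in this file both rest on the same output-shape rule, sketched here as a standalone function. A minimal sketch, not part of the patch; spaceToBatchNDShape and its std::vector-based signature are hypothetical:

#include <cstdint>
#include <utility>
#include <vector>

// The batch dim grows by prod(blockShape); every padded spatial dim is
// divided by its block size:
//   outShape[0]     = inShape[0] * prod(blockShape)
//   outShape[i + 1] = (inShape[i + 1] + padLeft[i] + padRight[i]) / blockShape[i]
std::vector<int64_t> spaceToBatchNDShape(const std::vector<int64_t>& inShape,
                                         const std::vector<int64_t>& blockShape,
                                         const std::vector<std::pair<int64_t, int64_t>>& padding) {
    std::vector<int64_t> outShape(inShape);
    for (int64_t b : blockShape)
        outShape[0] *= b;
    for (size_t i = 0; i < blockShape.size(); ++i)
        outShape[i + 1] = (outShape[i + 1] + padding[i].first + padding[i].second) / blockShape[i];
    return outShape;
}

For example, inShape = {2, 2, 4, 3, 1} with blockShape = {2, 2, 3} and padding = {{0,0}, {0,2}, {2,1}} yields {24, 1, 3, 2, 1}, which matches test space_to_batch_nd_2 further down.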
+
+ // FIXME - should we use this time-consuming validation ?
+ for (uint i = 0; i < numOfSpatialDims; ++i) {
+ const uint padLeft = padding->e<uint>(i,0);
+ const uint padRight = padding->e<uint>(i,1);
+ const Nd4jLong blockSize = blockShape->e<Nd4jLong>(i);
+ REQUIRE_TRUE((input->sizeAt(i + 1) + padLeft + padRight) % blockSize == 0, 0, "SpaceToBatchND: after padding, spatial dimensions of input array must be divisible by blockSize !");
+ }
+
+ helpers::spaceToBatchND(block.launchContext(), *input, *blockShape, *padding, *output);
+
+ return Status::OK();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+DECLARE_TYPES(space_to_batch_nd) {
+
+ getOpDescriptor()->setAllowedInputTypes(0, nd4j::DataType::ANY)
+ ->setAllowedInputTypes(1, {ALL_INTS})
+ ->setAllowedInputTypes(2, {ALL_INTS})
+ ->setSameMode(true);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+DECLARE_SHAPE_FN(space_to_batch_nd) {
+
+ auto inputShapeInfo = inputShape->at(0);
+ auto blockShapeInfo = inputShape->at(1);
+ auto paddingShapeInfo = inputShape->at(2);
+
+ REQUIRE_TRUE(blockShapeInfo[0] == 1, 0, "SpaceToBatchND: rank of blockShape array must be equal to one, but got %i instead !", blockShapeInfo[0]);
+
+ const uint numOfSpatialDims = blockShapeInfo[1];
+
+ if(paddingShapeInfo[1] != numOfSpatialDims || paddingShapeInfo[2] != 2) {
+ const std::string expectedpaddingShape = "[" + std::to_string(numOfSpatialDims) + ", 2]"; // [numOfSpatialDims, 2]
+ REQUIRE_TRUE(false, 0, "SpaceToBatchND: operation expects padding shape to be %s, but got %s instead", expectedpaddingShape.c_str(), ShapeUtils::shapeAsString(paddingShapeInfo).c_str());
+ }
+
+ std::vector<Nd4jLong> outShape(inputShapeInfo + 1, inputShapeInfo + 1 + inputShapeInfo[0]);
+
+ outShape[0] *= INPUT_VARIABLE(1)->reduceNumber(nd4j::reduce::Prod).e<Nd4jLong>(0);
+
+ for (uint i = 0; i < numOfSpatialDims; ++i)
+ outShape[i + 1] = (outShape[i + 1] + INPUT_VARIABLE(2)->e<Nd4jLong>(i,0) + INPUT_VARIABLE(2)->e<Nd4jLong>(i,1)) / INPUT_VARIABLE(1)->e<Nd4jLong>(i);
+
+ return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ArrayOptions::dataType(inputShapeInfo), 'c', outShape));
+}
+
+}
+}
+
+#endif
diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h
index 39b8fabfa..f9278fb36 100644
--- a/libnd4j/include/ops/declarable/headers/parity_ops.h
+++ b/libnd4j/include/ops/declarable/headers/parity_ops.h
@@ -617,6 +617,10 @@ namespace nd4j {
 DECLARE_CUSTOM_OP(space_to_batch, 2, 1, false, 0, 1);
 #endif
+ #if NOT_EXCLUDED(OP_space_to_batch_nd)
+ DECLARE_CUSTOM_OP(space_to_batch_nd, 3, 1, false, 0, 0);
+ #endif
+
 /**
 *
 *
@@ -624,6 +628,9 @@ namespace nd4j {
 #if NOT_EXCLUDED(OP_batch_to_space)
 DECLARE_CUSTOM_OP(batch_to_space, 2, 1, false, 0, 1);
 #endif
+ #if NOT_EXCLUDED(OP_batch_to_space_nd)
+ DECLARE_CUSTOM_OP(batch_to_space_nd, 3, 1, false, 0, 0);
+ #endif
 /**
 * top_k operation returns a vector of k top values for
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
index afed3ee94..cc97e3c5b 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
@@ -15,8 +15,8 @@
 ******************************************************************************/
//
-// @author raver119@gmail.com, created on 19.01.18.
// @author Yurii Shyrma (iuriish@yahoo.com) +// @author raver119@gmail.com // #include @@ -90,13 +90,107 @@ void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& o } } +////////////////////////////////////////////////////////////////////////// +template +static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims) { + + // input [bS, H * blockShape[0], W * blockShape[1], iC] + // output [bS, H * blockShape[0] - cropBottom - cropTop, W * blockShape[1] - cropLeft - cropRight, iC] + + // if (cropTop = cropBottom = cropRight = cropLeft = 0) shapes are the same + // else: + // oH -> [cropBottom, iH - cropTop] + // oW -> [cropLeft, iH - cropRight] + // xLen >= zLen + + const T* x = input.bufferAsT(); + T* z = output.bufferAsT(); + + const int rank = input.rankOf(); + const Nd4jLong zLen = output.lengthOf(); + + std::vector coords(rank); + + // loop through input array + PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) + + for (Nd4jLong i = 0; i < zLen; ++i) { + + shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data()); + + const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank); + + // evaluate spatial coordinates for x + for(uint j = 1; j <= numOfSpatialDims; ++j) + coords[j] += crop.e(j - 1, 0); // add crop left + + z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)]; + } +} + +BUILD_SINGLE_TEMPLATE(template void batchToSpaceND_, (const NDArray& input, const NDArray& crop, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); + +////////////////////////////////////////////////////////////////////////// +void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) { + + // 4D example, numOfSpatialDims = 2 - two spatial dimensions + // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC] + + const uint rank = input.rankOf(); + const uint numOfSpatialDims = blockShape.sizeAt(0); + + //*** construct reshaping std::vector for first reshape of input array ***// + + std::vector temp(numOfSpatialDims + rank); + + int i; + for(i = 0; i < numOfSpatialDims; ++i) + temp[i] = blockShape.e(i); + temp[i++] = output.sizeAt(0); + for(int j = 1; j < rank; ++i, ++j) + temp[i] = input.sizeAt(j); + + NDArray inputRearranged0 = input.reshape(input.ordering(), temp); + + //*** construct permuting std::vector for permutation of input array ***// + + temp[0] = numOfSpatialDims; + + for(i = 1; i <= numOfSpatialDims; ++i) { + temp[2*i - 1] = numOfSpatialDims + i; + temp[2*i] = i - 1; + } + for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i) + temp[i] = i; + + inputRearranged0.permutei(temp); + + + if(input.lengthOf() == output.lengthOf()) { + output.assign(inputRearranged0); + } + else { + //*** construct reshaping std::vector for second reshape of input array ***// + + temp.resize(rank); + + temp[0] = output.sizeAt(0); + + for(i = 1; i < rank; ++i) + temp[i] = (i <= numOfSpatialDims) ? 
input.sizeAt(i) * blockShape.e(i - 1) : input.sizeAt(i); + + NDArray inputRearranged1 = inputRearranged0.reshape(input.ordering(), temp); + + BUILD_SINGLE_SELECTOR(input.dataType(), batchToSpaceND_, (inputRearranged1, crop, output, numOfSpatialDims), LIBND4J_TYPES); + } +} ////////////////////////////////////////////////////////////////////////// template static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight) { // input [bS, H * blockSize - padBottom - padTop, W * blockSize - padLeft - padRight, iC] - // output [bs, H * blockSize, W * blockSize, iC] + // output [bS, H * blockSize, W * blockSize, iC] // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same // else: @@ -145,26 +239,153 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockSize*blockSize, (iH + padBottom + padTop)/blockSize, (iW + padLeft + padRight)/blockSize, iC] - NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), input.sizeAt(3)}); + NDArray outputRearranged0 = output.reshape(output.ordering(), {blockSize, blockSize, input.sizeAt(0), output.sizeAt(1), output.sizeAt(2), output.sizeAt(3)}); outputRearranged0.permutei({2, 3,0, 4,1, 5}); if(input.lengthOf() == output.lengthOf()) { outputRearranged0.assign(input); } else { - NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, input.sizeAt(3)}); + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), {input.sizeAt(0), output.sizeAt(1) * blockSize, output.sizeAt(2) * blockSize, output.sizeAt(3)}); BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatch_, (input, outputRearranged1, padBottom, padTop, padLeft, padRight), LIBND4J_TYPES); if(output.getBuffer() != outputRearranged1.getBuffer()) outputRearranged0.assign(outputRearranged1); } - } + + + + + + + + + + + + + + +////////////////////////////////////////////////////////////////////////// +template +static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims) { + + // 4D example + // input [bS, H * blockShape[0] - padBottom - padTop, W * blockShape[1] - padLeft - padRight, iC] + // output [bS, H * blockShape[0], W * blockShape[1], iC] + + // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same + // else: + // iH -> [padBottom, oH - padTop] + // iW -> [padLeft, oW - padRight] + // zLen > xLen + + const T* x = input.bufferAsT(); + T* z = output.bufferAsT(); + + const int rank = input.rankOf(); + const Nd4jLong zLen = output.lengthOf(); + + std::vector coords(rank); + + // loop through output array + PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) + for (Nd4jLong i = 0; i < zLen; ++i) { + + shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data()); + + const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank); + + bool within = true; + + for(uint j = 1; j <= numOfSpatialDims; ++j) { + + const auto padLeft = padding.e(j - 1, 0); + const auto padRight = padding.e(j - 1, 1); + + within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight); + + if(!within) + break; + + coords[j] -= padLeft; // get coordinates for x + } + + if(within) + z[zOffset] = x[shape::getOffset(0, input.shapeOf(), 
input.stridesOf(), coords.data(), rank)]; + else + z[zOffset] = 0.f; + } +} + +BUILD_SINGLE_TEMPLATE(template void spaceToBatchND_, (const NDArray& input, const NDArray& padding, NDArray& output, const uint numOfSpatialDims), LIBND4J_TYPES); + +////////////////////////////////////////////////////////////////////////// +void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) { + + // 4D example with two spatial dimensions + // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC] + + const uint rank = input.rankOf(); + + const uint numOfSpatialDims = blockShape.sizeAt(0); + + //*** construct reshaping std::vector for first reshape of output array ***// + std::vector temp(numOfSpatialDims + rank); + + int i; + for(i = 0; i < numOfSpatialDims; ++i) + temp[i] = blockShape.e(i); + temp[i++] = input.sizeAt(0); + for(int j = 1; j < rank; ++i, ++j) + temp[i] = output.sizeAt(j); + + NDArray outputRearranged0 = output.reshape(output.ordering(), temp); + + //*** construct permuting std::vector for permutation of output array ***// + + temp[0] = numOfSpatialDims; + + for(i = 1; i <= numOfSpatialDims; ++i) { + temp[2*i - 1] = numOfSpatialDims + i; + temp[2*i] = i - 1; + } + for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i) + temp[i] = i; + + outputRearranged0.permutei(temp); + + // ****** // + + if(input.lengthOf() == output.lengthOf()) { + outputRearranged0.assign(input); + } + else { + + //*** construct reshaping std::vector for second reshape of output array ***// + temp.resize(rank); + + temp[0] = input.sizeAt(0); + + for(i = 1; i < rank; ++i) + temp[i] = (i <= numOfSpatialDims) ? 
output.sizeAt(i) * blockShape.e(i - 1) : output.sizeAt(i); + + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp); + + BUILD_SINGLE_SELECTOR(input.dataType(), spaceToBatchND_, (input, padding, outputRearranged1, numOfSpatialDims), LIBND4J_TYPES); + + if(output.getBuffer() != outputRearranged1.getBuffer()) + outputRearranged0.assign(outputRearranged1); + } +} + + /* template struct SpaceToBatchHelper { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu index a1725ff2d..9ab7337c2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu @@ -135,10 +135,10 @@ __global__ static void col2imCuda2(const void *columns, void *image, const Nd4jL for (int i = (blockDim.x * blockIdx.x) + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { T val = 0; + int w_im = i % iW + pW; int h_im = (i / iW) % iH + pH; int c_im = i / (iW * iH); - int b = c_im / iC; int c = c_im % iC; @@ -180,9 +180,9 @@ __global__ static void col2imCuda2(const void *columns, void *image, const Nd4jL ////////////////////////////////////////////////////////////////////////// template static void col2imCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, - const void* columns, const Nd4jLong* colShapeInfo, - void* image, const Nd4jLong* imShapeInfo, - const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { + const void* columns, const Nd4jLong* colShapeInfo, + void* image, const Nd4jLong* imShapeInfo, + const int sH, const int sW, const int pH, const int pW, const int dH, const int dW) { col2imCuda2<<<512, 512, 1024, *stream>>>(columns, image, colShapeInfo, imShapeInfo, sH, sW, pH, pW, dH, dW); //col2imCuda<<>>(columns, colShapeInfo, image, imShapeInfo, sH, sW, pH, pW, dH, dW); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu index a37078ad9..991cdb660 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu @@ -39,8 +39,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI T* col = reinterpret_cast(columns); __shared__ int colRank, volRank; - __shared__ Nd4jLong colLen, iD, iH, iW; - __shared__ Nd4jLong *sharedMem; + __shared__ Nd4jLong colLen, iD, iH, iW, *sharedMem; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; @@ -1339,9 +1338,9 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn extern __shared__ unsigned char shmem[]; sharedMem = reinterpret_cast(shmem); - dimIH = isNCHW ? 2 : 1; - zLen = shape::length(zShapeInfo); - rank = 4; + dimIH = isNCHW ? 2 : 1; + zLen = shape::length(zShapeInfo); + rank = 4; } __syncthreads(); @@ -1408,8 +1407,8 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn sharedMem = reinterpret_cast(shmem); dimID = isNCDHW ? 2 : 1; - zLen = shape::length(zShapeInfo); - rank = 5; + zLen = shape::length(zShapeInfo); + rank = 5; } __syncthreads(); @@ -1478,8 +1477,8 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape sharedMem = reinterpret_cast(shmem); dimIH = isNCHW ? 
2 : 1; - zLen = shape::length(zShapeInfo); - rank = 4; + zLen = shape::length(zShapeInfo); + rank = 4; factorH = xShapeInfo[dimIH + 1] / zShapeInfo[dimIH + 1]; factorW = xShapeInfo[dimIH + 2] / zShapeInfo[dimIH + 2]; @@ -1550,8 +1549,8 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape sharedMem = reinterpret_cast(shmem); dimID = isNCDHW ? 2 : 1; - zLen = shape::length(zShapeInfo); - rank = 5; + zLen = shape::length(zShapeInfo); + rank = 5; factorD = xShapeInfo[dimID + 1] / zShapeInfo[dimID + 1]; factorH = xShapeInfo[dimID + 2] / zShapeInfo[dimID + 2]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu index bcd484fe9..0ac0a1882 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu @@ -38,13 +38,13 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn // else: // oH -> [cropBottom, iH - cropTop] // oW -> [cropLeft, iH - cropRight] - // xLen > zLen + // xLen >= zLen const auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); __shared__ int rank; - __shared__ Nd4jLong zLen, totalThreads, *sharedMem; + __shared__ Nd4jLong zLen, *sharedMem; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; @@ -52,7 +52,6 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn rank = shape::rank(zShapeInfo); zLen = shape::length(zShapeInfo); - totalThreads = gridDim.x * blockDim.x; } __syncthreads(); @@ -116,6 +115,139 @@ void batchToSpace(nd4j::LaunchContext* context, const NDArray& input, NDArray& o } } + + +/////////////////////////////////////////////////////////////////// +template +__global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShapeInfo, + const void* vy, const Nd4jLong* yShapeInfo, + void* vz, const Nd4jLong* zShapeInfo, + const uint numOfSpatialDims) { + + // 4D example, numOfSpatialDims = 2 + // input [bS, H * blockShape[0], W * blockShape[1], iC] + // output [bS, H * blockShape[0] - cropBottom - cropTop, W * blockShape[1] - cropLeft - cropRight, iC] + + // if (cropTop = cropBottom = cropRight = cropLeft = 0) shapes are the same + // else: + // oH -> [cropBottom, iH - cropTop] + // oW -> [cropLeft, iH - cropRight] + // xLen >= zLen + + const auto x = reinterpret_cast(vx); + const auto y = reinterpret_cast(vy); + auto z = reinterpret_cast(vz); + + __shared__ int rank; + __shared__ Nd4jLong zLen, *sharedMem; + + if (threadIdx.x == 0) { + + extern __shared__ unsigned char shmem[]; + sharedMem = reinterpret_cast(shmem); + + rank = shape::rank(zShapeInfo); + zLen = shape::length(zShapeInfo); + } + + __syncthreads(); + + auto coords = sharedMem + threadIdx.x * rank; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) { + + shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + + const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + + // evaluate spatial coordinates for x + for(uint j = 1; j <= numOfSpatialDims; ++j) { + const auto yOffset = (j - 1) * yShapeInfo[3]; // yRank = 2, calculate offset manually + coords[j] += y[yOffset]; // add crop left + } + + const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + + z[zOffset] = x[xOffset]; + } +} + +/////////////////////////////////////////////////////////////////// +template +static void batchToSpaceNDCudaLauncher(const int blocksPerGrid, const int 
threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims) { + + batchToSpaceNDCuda<<>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, numOfSpatialDims); +} +BUILD_DOUBLE_TEMPLATE(template void batchToSpaceNDCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES); + +////////////////////////////////////////////////////////////////////////// +void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output) { + + // 4D example, numOfSpatialDims = 2 - two spatial dimensions + // [bS*blockShape[0]*blockShape[1], iH, iW, iC] is rearranged/permuted to [bS, iH*blockShape[0] - cropTop - cropBottom, iW*blockShape[1] - cropLeft - cropRight, iC] + + const uint rank = input.rankOf(); + const uint numOfSpatialDims = blockShape.sizeAt(0); + + //*** construct reshaping std::vector for first reshape of input array ***// + + std::vector temp(numOfSpatialDims + rank); + + int i; + for(i = 0; i < numOfSpatialDims; ++i) + temp[i] = blockShape.e(i); + temp[i++] = output.sizeAt(0); + for(int j = 1; j < rank; ++i, ++j) + temp[i] = input.sizeAt(j); + + NDArray inputRearranged0 = input.reshape(input.ordering(), temp); + + //*** construct permuting std::vector for permutation of input array ***// + + temp[0] = numOfSpatialDims; + + for(i = 1; i <= numOfSpatialDims; ++i) { + temp[2*i - 1] = numOfSpatialDims + i; + temp[2*i] = i - 1; + } + for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i) + temp[i] = i; + + inputRearranged0.permutei(temp); + + + if(input.lengthOf() == output.lengthOf()) { + + output.assign(inputRearranged0); + } + else { + //*** construct reshaping std::vector for second reshape of input array ***// + + temp.resize(rank); + + temp[0] = output.sizeAt(0); + + for(i = 1; i < rank; ++i) + temp[i] = (i <= numOfSpatialDims) ? 
input.sizeAt(i) * blockShape.e(i - 1) : input.sizeAt(i); + + NDArray inputRearranged1 = inputRearranged0.reshape(input.ordering(), temp); + + const int threadsPerBlock = MAX_NUM_THREADS / 4; + const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; + const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128; + + PointersManager manager(context, "batchToSpaceND"); + + NDArray::prepareSpecialUse({&output}, {&inputRearranged1, &crop}); + BUILD_DOUBLE_SELECTOR(input.dataType(), crop.dataType(), batchToSpaceNDCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), inputRearranged1.getSpecialBuffer(), inputRearranged1.getSpecialShapeInfo(), crop.getSpecialBuffer(), crop.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES); + NDArray::registerSpecialUse({&output}, {&inputRearranged1, &crop}); + + manager.synchronize(); + } +} + + + /////////////////////////////////////////////////////////////////// template __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight) { @@ -133,7 +265,7 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn auto z = reinterpret_cast(vz); __shared__ int rank; - __shared__ Nd4jLong zLen, totalThreads, *sharedMem; + __shared__ Nd4jLong zLen, *sharedMem; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; @@ -141,7 +273,6 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn rank = shape::rank(zShapeInfo); zLen = shape::length(zShapeInfo); - totalThreads = gridDim.x * blockDim.x; } __syncthreads(); @@ -210,6 +341,153 @@ void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& o } } +/////////////////////////////////////////////////////////////////// +template +__global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShapeInfo, + const void* vy, const Nd4jLong* yShapeInfo, + void* vz, const Nd4jLong* zShapeInfo, + const uint numOfSpatialDims) { + + // x - input, y - padding, z - output + + // 4D example + // input [bS, H * blockShape[0] - padBottom - padTop, W * blockShape[1] - padLeft - padRight, iC] + // output [bS, H * blockShape[0], W * blockShape[1], iC] + + // if (padTop = padBottom = padRight = padLeft = 0) shapes are the same + // else: + // iH -> [padBottom, oH - padTop] + // iW -> [padLeft, oW - padRight] + // zLen > xLen + + const auto x = reinterpret_cast(vx); + const auto y = reinterpret_cast(vy); + auto z = reinterpret_cast(vz); + + __shared__ int rank; // xRank = zRank, yRank = 2; + __shared__ Nd4jLong zLen, totalThreads, *sharedMem; + + if (threadIdx.x == 0) { + + extern __shared__ unsigned char shmem[]; + sharedMem = reinterpret_cast(shmem); + + rank = shape::rank(zShapeInfo); + zLen = shape::length(zShapeInfo); + totalThreads = gridDim.x * blockDim.x; + } + + __syncthreads(); + + auto coords = sharedMem + threadIdx.x * rank; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < zLen; i += totalThreads) { + + shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + + const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + + bool within = true; + + for(uint j = 1; j <= numOfSpatialDims; ++j) { + + // yRank = 2, calculate offset manually + const auto yOffset = (j - 1) * yShapeInfo[3]; + const auto 
padLeft = y[yOffset]; + const auto padRight = y[yOffset + yShapeInfo[4]]; + + within &= (coords[j] >= padLeft && coords[j] < shape::shapeOf(const_cast(zShapeInfo))[j] - padRight); + + if(!within) + break; + + coords[j] -= padLeft; // get coordinates for x + } + + if(within) + z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + else + z[zOffset] = 0.f; + } +} + +/////////////////////////////////////////////////////////////////// +template +static void spaceToBatchNDCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims) { + + spaceToBatchNDCuda<<>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, numOfSpatialDims); +} +BUILD_DOUBLE_TEMPLATE(template void spaceToBatchNDCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const void* vy, const Nd4jLong* yShapeInfo, void* vz, const Nd4jLong* zShapeInfo, const uint numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES); + +////////////////////////////////////////////////////////////////////////// +void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output ) { + + // 4D example with two spatial dimensions + // [bS, iH, iW, iC] is rearranged/permuted to [bS*blockShape[0]*blockShape[1], (iH + padBottom + padTop)/blockShape[0], (iW + padLeft + padRight)/blockShape[1], iC] + + const uint rank = input.rankOf(); + + const uint numOfSpatialDims = blockShape.sizeAt(0); + + //*** construct reshaping std::vector for first reshape of output array ***// + std::vector temp(numOfSpatialDims + rank); + + int i; + for(i = 0; i < numOfSpatialDims; ++i) + temp[i] = blockShape.e(i); + temp[i++] = input.sizeAt(0); + for(int j = 1; j < rank; ++i, ++j) + temp[i] = output.sizeAt(j); + + NDArray outputRearranged0 = output.reshape(output.ordering(), temp); + + //*** construct permuting std::vector for permutation of output array ***// + + temp[0] = numOfSpatialDims; + + for(i = 1; i <= numOfSpatialDims; ++i) { + temp[2*i - 1] = numOfSpatialDims + i; + temp[2*i] = i - 1; + } + for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i) + temp[i] = i; + + outputRearranged0.permutei(temp); + + // ****** // + + if(input.lengthOf() == output.lengthOf()) { + outputRearranged0.assign(input); + } + else { + + //*** construct reshaping std::vector for second reshape of output array ***// + temp.resize(rank); + + temp[0] = input.sizeAt(0); + + for(i = 1; i < rank; ++i) + temp[i] = (i <= numOfSpatialDims) ? 
output.sizeAt(i) * blockShape.e(i - 1) : output.sizeAt(i); + + NDArray outputRearranged1 = outputRearranged0.reshape(output.ordering(), temp); + + const int threadsPerBlock = MAX_NUM_THREADS / 4; + const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; + const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128; + + PointersManager manager(context, "spaceToBatchND"); + + NDArray::prepareSpecialUse({&outputRearranged1}, {&input, &padding}); + BUILD_DOUBLE_SELECTOR(input.dataType(), padding.dataType(), spaceToBatchNDCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), padding.getSpecialBuffer(), padding.getSpecialShapeInfo(), outputRearranged1.specialBuffer(), outputRearranged1.specialShapeInfo(), numOfSpatialDims), LIBND4J_TYPES, INTEGER_TYPES); + NDArray::registerSpecialUse({&outputRearranged1}, {&input, &padding}); + + manager.synchronize(); + + if(output.getSpecialBuffer() != outputRearranged1.getSpecialBuffer()) + outputRearranged0.assign(outputRearranged1); + } +} + /* template diff --git a/libnd4j/include/ops/declarable/helpers/s_t_b.h b/libnd4j/include/ops/declarable/helpers/s_t_b.h index d8c47979f..e761905fd 100644 --- a/libnd4j/include/ops/declarable/helpers/s_t_b.h +++ b/libnd4j/include/ops/declarable/helpers/s_t_b.h @@ -31,6 +31,10 @@ namespace helpers { void spaceToBatch(nd4j::LaunchContext* context, const NDArray& input, NDArray& output, const uint padBottom, const uint padTop, const uint padLeft, const uint padRight, const uint blockSize); + void spaceToBatchND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& padding, NDArray& output); + + void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const NDArray& blockShape, const NDArray& crop, NDArray& output); + /* // this method MUST be platform-specific diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp index 014719270..d4f422461 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp @@ -689,3 +689,161 @@ TEST_F(DeclarableOpsTests13, cyclic_rshift_bits_1) { delete result; } +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, space_to_batch_nd_1) { + + NDArray x('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 2} , nd4j::DataType::INT32); // three spatial dimensions + NDArray paddings('c', {3, 2}, {0, 0, 0, 0, 0, 0} , nd4j::DataType::INT32); + + NDArray exp('c', {8, 1, 1, 1, 3}, nd4j::DataType::FLOAT32); + + x.linspace(1); + exp.linspace(1); + + nd4j::ops::space_to_batch_nd op; + auto result = op.execute({&x, &blockShape, &paddings}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, space_to_batch_nd_2) { + + NDArray x('c', {2, 2,4,3, 1}, nd4j::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32); // three spatial dimensions + NDArray paddings('c', {3, 2}, {0,0, 0,2, 2,1} , nd4j::DataType::INT32); + + NDArray exp('c', {24, 1,3,2, 1}, { 0, 2, 0, 8, 0, 0, 0, 26, 0, 32, 0, 0, 0, 3, 0, 9, 0, 0, 0, 27, 0, 33, 0, 0, 1, + 0, 7, 0, 0, 0, 25, 0, 31, 0, 
0, 0, 0, 5, 0, 11, 0, 0, 0, 29, 0, 35, 0, 0, 0, 6, + 0, 12, 0, 0, 0, 30, 0, 36, 0, 0, 4, 0, 10, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 14, + 0, 20, 0, 0, 0, 38, 0, 44, 0, 0, 0, 15, 0, 21, 0, 0, 0, 39, 0, 45, 0, 0, 13, 0, + 19, 0, 0, 0, 37, 0, 43, 0, 0, 0, 0, 17, 0, 23, 0, 0, 0, 41, 0, 47, 0, 0, 0, 18, + 0, 24, 0, 0, 0, 42, 0, 48, 0, 0, 16, 0, 22, 0, 0, 0, 40, 0, 46, 0, 0, 0}, nd4j::DataType::FLOAT32); + x.linspace(1); + + nd4j::ops::space_to_batch_nd op; + auto result = op.execute({&x, &blockShape, &paddings}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + // z->printBuffer(); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, space_to_batch_nd_3) { + + NDArray x('c', {2, 2,4,3, 1}, nd4j::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32); // three spatial dimensions + NDArray paddings('c', {3, 2}, {1,1, 0,2, 2,1} , nd4j::DataType::INT32); + + NDArray exp('c', {24, 2,3,2, 1}, { 0, 0, 0, 0, 0, 0, 0, 14, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 38, 0, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, + 0, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39, 0, 45, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 19, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 37, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41, 0, 47, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 18, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0, 48, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, + 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 0, 46, 0, 0, 0, 0, 2, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 26, 0, 32, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27, 0, 33, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 11, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 0, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 0, 36, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0}, nd4j::DataType::FLOAT32); + x.linspace(1); + + nd4j::ops::space_to_batch_nd op; + auto result = op.execute({&x, &blockShape, &paddings}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + // z->printBuffer(); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batch_to_space_nd_1) { + + NDArray x('c', {8, 1, 1, 1, 3}, nd4j::DataType::FLOAT32); + + NDArray blockShape('c', {3}, {2, 2, 2} , nd4j::DataType::INT32); // three spatial dimensions + NDArray crop('c', {3, 2}, {0, 0, 0, 0, 0, 0} , nd4j::DataType::INT32); + + NDArray exp('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32); + + x.linspace(1); + exp.linspace(1); + + nd4j::ops::batch_to_space_nd op; + auto result = op.execute({&x, &blockShape, &crop}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batch_to_space_nd_2) { + + NDArray x('c', {24, 1,3,2, 1}, nd4j::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32); // three spatial dimensions + NDArray crop('c', {3, 2}, {0,0, 0,2, 2,1} , nd4j::DataType::INT32); + + NDArray exp('c', {2, 2,4,3, 1}, {25, 2, 14, 61, 38, 50, 27, 4, 16, 63, 40, 52, 97, 74, 86, 133, 110, 122, 99, 76, 88, 135, 
112, 124, + 31, 8, 20, 67, 44, 56, 33, 10, 22, 69, 46, 58, 103, 80, 92, 139, 116, 128, 105, 82, 94, 141, 118, 130}, nd4j::DataType::FLOAT32); + x.linspace(1); + + nd4j::ops::batch_to_space_nd op; + auto result = op.execute({&x, &blockShape, &crop}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + // z->printBuffer(); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, batch_to_space_nd_3) { + + NDArray x('c', {24, 2,3,2, 1}, nd4j::DataType::FLOAT32); + NDArray blockShape('c', {3}, {2, 2, 3} , nd4j::DataType::INT32); // three spatial dimensions + NDArray crop('c', {3, 2}, {1,1, 0,2, 2,1} , nd4j::DataType::INT32); + + NDArray exp('c', {2, 2,4,3, 1}, {193, 146, 170, 265, 218, 242, 195, 148, 172, 267, 220, 244, 55, 8, 32, 127, 80, 104, 57, 10, 34, 129, 82, + 106, 205, 158, 182, 277, 230, 254, 207, 160, 184, 279, 232, 256, 67, 20, 44, 139, 92, 116, 69, 22, 46, 141, 94, 118}, nd4j::DataType::FLOAT32); + x.linspace(1); + + nd4j::ops::batch_to_space_nd op; + auto result = op.execute({&x, &blockShape, &crop}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + // z->printBuffer(); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + + +
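Taken together, the tests above also imply a round-trip property: with the same blockShape and zero paddings/crops, batch_to_space_nd inverts space_to_batch_nd. A minimal usage sketch in the style of the tests above (assuming the same test fixture and headers; not part of the patch):

NDArray x('c', {1, 2, 2, 2, 3}, nd4j::DataType::FLOAT32);
NDArray blockShape('c', {3}, {2, 2, 2}, nd4j::DataType::INT32);     // three spatial dimensions
NDArray zeros('c', {3, 2}, {0,0, 0,0, 0,0}, nd4j::DataType::INT32); // zero paddings, reused as zero crops
x.linspace(1);

nd4j::ops::space_to_batch_nd stb;
auto stbResult = stb.execute({&x, &blockShape, &zeros}, {}, {}); // -> shape [8, 1, 1, 1, 3]
ASSERT_EQ(Status::OK(), stbResult->status());

nd4j::ops::batch_to_space_nd bts;
auto btsResult = bts.execute({stbResult->at(0), &blockShape, &zeros}, {}, {}); // -> back to [1, 2, 2, 2, 3]
ASSERT_EQ(Status::OK(), btsResult->status());

ASSERT_TRUE(x.isSameShape(btsResult->at(0)));
ASSERT_TRUE(x.equalsTo(btsResult->at(0))); // the round trip restores the original array

delete stbResult;
delete btsResult;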