From e604ffe0d2233bd6c506e26a8f3cdbe99a1bb63a Mon Sep 17 00:00:00 2001 From: raver119 Date: Wed, 21 Aug 2019 21:10:29 +0300 Subject: [PATCH] [WIP] repeat op (#143) * - write new repeat helper (cpu) Signed-off-by: Yurii * - update NDArray::cpu Signed-off-by: Yurii * - update NDArray::repeat cuda Signed-off-by: Yurii --- libnd4j/blas/NDArray.h | 9 +- libnd4j/blas/cpu/NDArray.cpp | 101 +++++-------- libnd4j/blas/cuda/NDArray.cu | 143 ++++++++++++------ libnd4j/include/helpers/ShapeUtils.h | 2 +- libnd4j/include/helpers/impl/ShapeUtils.cpp | 36 ++--- .../loops/cuda/specials/repeatKernel.cu | 97 ------------ libnd4j/include/loops/special_kernels.h | 11 -- .../declarable/generic/transforms/repeat.cpp | 72 +++++---- .../ops/declarable/helpers/cpu/transforms.cpp | 12 +- .../ops/declarable/helpers/transforms.h | 1 - .../layers_tests/DeclarableOpsTests1.cpp | 34 ----- .../layers_tests/DeclarableOpsTests14.cpp | 90 +++++++++++ .../layers_tests/DeclarableOpsTests16.cpp | 14 -- .../layers_tests/MultiDataTypeTests.cpp | 2 +- .../tests_cpu/layers_tests/NDArrayTests.cpp | 2 +- 15 files changed, 298 insertions(+), 328 deletions(-) delete mode 100644 libnd4j/include/loops/cuda/specials/repeatKernel.cu diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index a55a10768..5d8e2d47a 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -316,10 +316,10 @@ namespace nd4j { /** * create a new array by replicating current array by repeats times along given dimension - * dimension - dimension along which to repeat elements + * axis - axis along which to repeat elements * repeats - number of repetitions */ - NDArray* repeat(int dimension, const std::vector& repeats) const; + NDArray* repeat(const int axis, const std::vector& repeats) const; /** * This method fills this array with zeros @@ -344,9 +344,10 @@ namespace nd4j { /** * fill target array by repeating current array - * dimension - dimension along which to repeat elements + * axis - axis along which to repeat elements + * repeats - vector containing numbers of repetition for elements at given axis */ - void repeat(int dimension, NDArray& target) const; + void repeat(const int axis, const std::vector& repeats, NDArray& target) const; /** * creates array which points on certain sub-range of this array, sub-range is defined by given indices diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp index d6c82ea86..24ef100d3 100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/blas/cpu/NDArray.cpp @@ -363,79 +363,62 @@ void NDArray::tile(NDArray& target) const { } } -////////////////////////////////////////////////////////////////////////// -// create new array by repeating it the number of times given by reps -NDArray* NDArray::repeat(int dimension, const std::vector& repeats) const { - auto outShape = ShapeUtils::evalRepeatShape(dimension, repeats, *this); +//////////////////////////////////////////////////////////////////////// +template +static void repeat_(const NDArray& input, NDArray& output, const std::vector& repeats, const int axis) { - // the size of outShape == rank - int rank = rankOf(); // = outShape.size() + const X* x = input.bufferAsT(); + Z* z = output.bufferAsT(); - auto ret = new NDArray('c', outShape, dataType(), getContext()); + const int rank = input.rankOf(); // xRank = zRank + const int zLen = output.lengthOf(); // xLen <= zLen + const int repSize = repeats.size(); - auto retArrs = ret->allTensorsAlongDimension({dimension}); - auto thisArrs = this->allTensorsAlongDimension({dimension}); + 
std::vector coords(rank); - auto repeatDelta = shape::prodLong(outShape.data(), rank) / this->lengthOf(); - auto numTads = retArrs->size(); + // loop through input array + PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) + for (Nd4jLong i = 0; i < zLen; ++i) { - for (int i = 0; i < numTads; i++) { - auto thisTensor = thisArrs->at(i); - auto retTensor = retArrs->at(i); - Nd4jLong retIdx = 0; + shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data()); - for (Nd4jLong k = 0; k < thisTensor->lengthOf(); k++) { - auto s = thisTensor->e(k); - for (Nd4jLong j = 0; j < repeatDelta; j++) - retTensor->p(retIdx++, s); + const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank); + + if(repSize > 1) { + for (uint j = 0; j < repSize; ++j) { + coords[axis] -= repeats[j]; + if (coords[axis] < 0) { + coords[axis] = j; + break; + } + } } - } + else + coords[axis] /= repeats[0]; - delete retArrs; - delete thisArrs; - return ret; + z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)]; + } +} + +////////////////////////////////////////////////////////////////////////// +// create new array by repeating it the number of times given by repeats +NDArray* NDArray::repeat(const int axis, const std::vector& repeats) const { + + auto output = new NDArray('c', ShapeUtils::evalRepeatShape(axis, repeats, *this), dataType(), getContext()); + + BUILD_SINGLE_SELECTOR_TWICE(dataType(), repeat_, (*this, *output, repeats, axis), LIBND4J_TYPES); + + return output; } ////////////////////////////////////////////////////////////////////////// // fill array by repeating it the number of times given by reps -void NDArray::repeat(int dimension, NDArray& target) const { +void NDArray::repeat(const int axis, const std::vector& repeats, NDArray& target) const { - if(dimension < 0) - dimension += rankOf(); + if(!target.isSameShape(ShapeUtils::evalRepeatShape(axis, repeats, *this))) + throw std::invalid_argument("NDArray::repeat(const int axis, const std::vector& repeats, NDArray& target) method: wrong shape of target array!"); - if(rankOf() != target.rankOf()) - throw std::invalid_argument("NDArray::repeat(int dimension, NDArray& target) method: wrong rank of target array it must be equal to this array rank!"); - - Nd4jLong repeatDelta = target.sizeAt(dimension) / sizeAt(dimension); - - if(repeatDelta == 0) - throw std::invalid_argument("NDArray::repeat(int dimension, NDArray& target) method: wrong shape of target array!"); - - - std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(rankOf(), {dimension}); - const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(getShapeInfo(), dimsToExclude); - - for (int i = 0; i < numTads; i++) { - auto thisTensor = (*this)(i, dimsToExclude); - auto retTensor = target(i, dimsToExclude); - int tensorLength = thisTensor.lengthOf(); - int retIdx = 0; - if (isR()) { - for (int k = 0; k < tensorLength; k++) { - auto s = thisTensor.e(k); - for (int j = 0; j < repeatDelta; j++) { - retTensor.p(retIdx++, s); - } - } - } else { - for (int k = 0; k < tensorLength; k++) { - auto s = thisTensor.e(k); - for (int j = 0; j < repeatDelta; j++) { - retTensor.p(retIdx++, s); - } - } - } - } + BUILD_DOUBLE_SELECTOR(dataType(), target.dataType(), repeat_, (*this, target, repeats, axis), LIBND4J_TYPES, LIBND4J_TYPES); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/blas/cuda/NDArray.cu index e80e6dddd..f6a05c44b 100644 --- 
a/libnd4j/blas/cuda/NDArray.cu +++ b/libnd4j/blas/cuda/NDArray.cu @@ -392,64 +392,115 @@ void NDArray::tile(NDArray& target) const { registerSpecialUse({&target}, {this}); } -////////////////////////////////////////////////////////////////////////// -// create new array by repeating it the number of times given by reps -NDArray* NDArray::repeat(int dimension, const std::vector& repeats) const { - auto outShape = ShapeUtils::evalRepeatShape(dimension, repeats, *this); +//////////////////////////////////////////////////////////////////////// +template +__global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo, + void* vz, const Nd4jLong* zShapeInfo, + const int* repeats, const int repSize, + const int axis) { - // the size of outShape == rank - int rank = rankOf(); // = outShape.size() + const X* x = reinterpret_cast(vx); + Z* z = reinterpret_cast(vz); - std::vector newShape(rank); - for (int i = 0; i < rank; i++) - newShape[i] = outShape[i]; + __shared__ int rank; + __shared__ Nd4jLong zLen, totalThreads, *sharedMem; // xLen = zLen - auto ret = new NDArray('c', outShape, dataType(), getContext()); + if (threadIdx.x == 0) { - auto repeatDelta = shape::prodLong(newShape.data(), rank) / this->lengthOf(); - std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(rankOf(), {dimension}); - const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(getShapeInfo(), dimsToExclude); //this->tensorsAlongDimension({dimension}); - std::vector copy({dimension}); + extern __shared__ unsigned char shmem[]; + sharedMem = reinterpret_cast(shmem); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(ret->getShapeInfo(), copy); + rank = shape::rank(zShapeInfo); // xRank = zRank + zLen = shape::length(zShapeInfo); // xLen <= zLen - prepareSpecialUse({ret}, {this}); - auto stream = getContext()->getCudaStream(); - BUILD_SINGLE_SELECTOR(dataType(), repeatKernelH, (getSpecialBuffer(), ret->getSpecialBuffer(), numTads, lengthOf(), ret->lengthOf(), packX.platformShapeInfo(), packX.platformOffsets(), packZ.platformShapeInfo(), packZ.platformOffsets(), *stream), LIBND4J_TYPES); - registerSpecialUse({ret}, {this}); + totalThreads = gridDim.x * blockDim.x; + } - return ret; + __syncthreads(); + + auto coords = sharedMem + threadIdx.x * rank; + + const auto tid = blockIdx.x * blockDim.x + threadIdx.x; + + for (Nd4jLong i = tid; i < zLen; i += totalThreads) { + + shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + + const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + + if(repSize > 1) { + for (uint j = 0; j < repSize; ++j) { + coords[axis] -= repeats[j]; + if (coords[axis] < 0) { + coords[axis] = j; + break; + } + } + } + else + coords[axis] /= repeats[0]; + + z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + } } ////////////////////////////////////////////////////////////////////////// -// fill array by repeating it the number of times given by reps -void NDArray::repeat(int dimension, NDArray& target) const { +template +static void repeatCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, + const void* vx, const Nd4jLong* xShapeInfo, + void* vz, const Nd4jLong* zShapeInfo, + const int* repeats, const int repSize, + const int axis) { - if(dimension < 0) - dimension += rankOf(); - - if(rankOf() != target.rankOf()) - throw 
std::invalid_argument("NDArray::repeat(int dimension, NDArray& target) method: wrong rank of target array it must be equal to this array rank!"); - - Nd4jLong repeatDelta = target.sizeAt(dimension) / sizeAt(dimension); - - if(repeatDelta == 0) - throw std::invalid_argument("NDArray::repeat(int dimension, NDArray& target) method: wrong shape of target array!"); - - - std::vector dimsToExclude = ShapeUtils::evalDimsToExclude(rankOf(), {dimension}); - const Nd4jLong numTads = ShapeUtils::getNumOfSubArrs(getShapeInfo(), dimsToExclude); - - std::vector copy({dimension}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(this->getShapeInfo(), copy); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(target.getShapeInfo(), copy); - - NDArray::prepareSpecialUse({&target}, {this}); - auto stream = getContext()->getCudaStream(); - BUILD_SINGLE_SELECTOR_TWICE(target.dataType(), repeatKernelHH, (getSpecialBuffer(), target.getSpecialBuffer(), numTads, lengthOf(), packX.platformShapeInfo(), packX.platformOffsets(), packZ.platformShapeInfo(), packZ.platformOffsets(), *stream), LIBND4J_TYPES); - NDArray::registerSpecialUse({&target}, {this}); + repeatCuda<<>>(vx, xShapeInfo, vz, zShapeInfo, repeats, repSize, axis); } +BUILD_DOUBLE_TEMPLATE(template void repeatCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void *vx, const Nd4jLong *xShapeInfo, void *vz, const Nd4jLong *zShapeInfo, const int* repeats, const int repSize, const int axis), LIBND4J_TYPES, LIBND4J_TYPES); + +////////////////////////////////////////////////////////////////////////// +// create new array by repeating it the number of times given by repeats +NDArray* NDArray::repeat(const int axis, const std::vector& repeats) const { + + auto output = new NDArray('c', ShapeUtils::evalRepeatShape(axis, repeats, *this), dataType(), getContext()); + + const int threadsPerBlock = MAX_NUM_THREADS / 2; + const int blocksPerGrid = (output->lengthOf() + threadsPerBlock - 1) / threadsPerBlock; + const int sharedMem = output->rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128; + + PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector& repeats)"); + + const int* reps = reinterpret_cast(manager.replicatePointer(repeats.data(), repeats.size() * sizeof(int))); + + prepareSpecialUse({output}, {this}); + BUILD_SINGLE_SELECTOR_TWICE(dataType(), repeatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, getContext()->getCudaStream(), getSpecialBuffer(), getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), reps, repeats.size(), axis), LIBND4J_TYPES); + prepareSpecialUse({output}, {this}); + + manager.synchronize(); + + return output; +} + +////////////////////////////////////////////////////////////////////////// +// fill array by repeating it the number of times given by repeats +void NDArray::repeat(const int axis, const std::vector& repeats, NDArray& target) const { + + if(!target.isSameShape(ShapeUtils::evalRepeatShape(axis, repeats, *this))) + throw std::invalid_argument("NDArray::repeat(const int axis, const std::vector& repeats, NDArray& target) method: wrong shape of target array!"); + + const int threadsPerBlock = MAX_NUM_THREADS / 2; + const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; + const int sharedMem = target.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128; + + PointersManager manager(getContext(), "NDArray::repeat(const int axis, 
const std::vector& repeats)"); + + const int* reps = reinterpret_cast(manager.replicatePointer(repeats.data(), repeats.size() * sizeof(int))); + + prepareSpecialUse({&target}, {this}); + BUILD_DOUBLE_SELECTOR(dataType(), target.dataType(), repeatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, getContext()->getCudaStream(), getSpecialBuffer(), getSpecialShapeInfo(), target.specialBuffer(), target.specialShapeInfo(), reps, repeats.size(), axis), LIBND4J_TYPES, LIBND4J_TYPES); + prepareSpecialUse({&target}, {this}); + + manager.synchronize(); +} + //////////////////////////////////////////////////////////////////////// void* NDArray::specialBuffer() { diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index 3f30585b0..1d991e36a 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -47,7 +47,7 @@ namespace nd4j { static Nd4jLong* evalReduceShapeInfoEmpty(const char order, std::vector& dimensions, const Nd4jLong *shapeInfo, const nd4j::DataType dataType, const bool keepDims, nd4j::memory::Workspace* workspace); // evaluate shape for array which is result of repeat operation applied to arr - static std::vector evalRepeatShape(int dimension, const std::vector& repeats, const NDArray& arr); + static std::vector evalRepeatShape(int axis, const std::vector& repeats, const NDArray& arr); // evaluate shapeInfo of permuted array static Nd4jLong* evalPermShapeInfo(const int* dimensions, const int rank, const NDArray& arr, nd4j::memory::Workspace* workspace); diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 1274582b3..6d93351c3 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -281,37 +281,21 @@ Nd4jLong* ShapeUtils::evalReduceShapeInfo(const char order, std::vector& di ////////////////////////////////////////////////////////////////////////// // evaluate shape for array which is result of repeat operation applied to arr - std::vector ShapeUtils::evalRepeatShape(int dimension, const std::vector& repeats, const NDArray& arr) { +std::vector ShapeUtils::evalRepeatShape(int axis, const std::vector& repeats, const NDArray& arr) { - int rank = arr.rankOf(); + if (axis < 0) + axis += arr.rankOf(); - if (dimension < 0) - dimension += rank; + if(repeats.size() != 1 && repeats.size() != arr.sizeAt(axis)) + throw std::invalid_argument("ShapeUtils::evalRepeatShape: size of repeats vector must be 1 or equal to dimension at given axis !"); - std::vector reps; + std::vector outShape = arr.getShapeAsVector(); - if ((int) reps.size() < rank) { - if (dimension > 0) { - for (int e = 0; e < rank - (int) repeats.size(); e++) - reps.push_back(1); + if(repeats.size() == 1) + outShape[axis] *= repeats[0]; - for (auto r: repeats) - reps.push_back(r); - } else { - for (auto r: repeats) - reps.push_back(r); - - for (int e = 0; e < rank - (int) repeats.size(); e++) - reps.push_back(1); - } - }/* else { - for (auto r: repeats) - reps.push_back(r); - }*/ - - std::vector outShape(rank); - for (int i = 0; i < rank; i++) - outShape[i] = arr.sizeAt(i) * reps.at(i); + else + outShape[axis] = std::accumulate(repeats.begin(), repeats.end(), 0); return outShape; } diff --git a/libnd4j/include/loops/cuda/specials/repeatKernel.cu b/libnd4j/include/loops/cuda/specials/repeatKernel.cu deleted file mode 100644 index c3177049f..000000000 --- a/libnd4j/include/loops/cuda/specials/repeatKernel.cu +++ /dev/null @@ -1,97 +0,0 @@ 
-/******************************************************************************* - * Copyright (c) 2015-2018 Skymind, Inc. - * - * This program and the accompanying materials are made available under the - * terms of the Apache License, Version 2.0 which is available at - * https://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations - * under the License. - * - * SPDX-License-Identifier: Apache-2.0 - ******************************************************************************/ - -// -// @author GS , created on 17.01.2019 -// - -#include - -namespace nd4j { - - - template - static __global__ void repeatKernel(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, - Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets) { - //auto tid = blockIdx.x * blockDim.x; // + threadIdx.x; -// int totalThreads = gridDim.x * blockDim.x; - int totalThreads = blockDim.x; - //const auto resultLength = shape::length(outputShape); - for (Nd4jLong i = blockIdx.x; i < numTads; i += gridDim.x) { - auto yOffset = tadInputOffsets[i]; - auto xOffset = tadOutputOffsets[i]; - for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) { - auto outputOffset = shape::getIndexOrderOffset(j, tadOnlyOutputShapeInfo, inputLength, shape::order(tadOnlyInputShapeInfo)); - auto inputOffset = shape::getIndexOrderOffset(j, tadOnlyInputShapeInfo, inputLength, shape::order(tadOnlyInputShapeInfo)); - *(reinterpret_cast(outputBuffer) + xOffset + outputOffset) = *(reinterpret_cast(inputBuffer) + yOffset + inputOffset); - } - } - } - BUILD_SINGLE_TEMPLATE(template __global__ void repeatKernel, (void const* inputBuffer, void* outputBuffer, - Nd4jLong numTads, Nd4jLong inputLength, Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets), LIBND4J_TYPES); - - template - static __global__ void repeatKernelDouble(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, - Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets) { - //auto tid = blockIdx.x * blockDim.x; // + threadIdx.x; - int totalThreads = gridDim.x * blockDim.x; - //const auto resultLength = shape::length(outputShape); - for (Nd4jLong i = blockIdx.x; i < numTads; i += gridDim.x) { - auto yOffset = tadInputOffsets[i]; - auto xOffset = tadOutputOffsets[i]; - for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) { - auto outputOffset = shape::getIndexOrderOffset(j, tadOnlyOutputShapeInfo, inputLength, shape::order(tadOnlyInputShapeInfo)); - auto inputOffset = shape::getIndexOrderOffset(j, tadOnlyInputShapeInfo, inputLength, shape::order(tadOnlyInputShapeInfo)); - *(reinterpret_cast(outputBuffer) + xOffset + outputOffset) = static_cast(*(reinterpret_cast(inputBuffer) + yOffset + inputOffset)); - } - } - } - BUILD_SINGLE_TEMPLATE_TWICE(template __global__ void repeatKernelDouble, (void const* inputBuffer, void* outputBuffer, - Nd4jLong numTads, Nd4jLong inputLength, Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets), LIBND4J_TYPES); - - 
template - void repeatKernelH(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, Nd4jLong outputLength, - Nd4jLong *tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong *tadOnlyOutputShapeInfo,Nd4jLong *tadOutputOffsets, - cudaStream_t stream) { - dim3 launchDims(256, 512, 8192); - repeatKernel<<>>(inputBuffer, outputBuffer, numTads, inputLength, tadOnlyInputShapeInfo, tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets); - } - BUILD_SINGLE_TEMPLATE(template void repeatKernelH, (void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, Nd4jLong outputLength, - Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets, - cudaStream_t stream), LIBND4J_TYPES); - - - template - void repeatKernelHH(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, - Nd4jLong *tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong *tadOnlyOutputShapeInfo,Nd4jLong *tadOutputOffsets, - cudaStream_t stream) { - dim3 launchDims(256, 512, 8192); - repeatKernelDouble<<>>(inputBuffer, outputBuffer, numTads, inputLength, tadOnlyInputShapeInfo, tadInputOffsets, tadOnlyOutputShapeInfo, tadOutputOffsets); - } - BUILD_SINGLE_TEMPLATE_TWICE(template void repeatKernelHH, (void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, - Nd4jLong* tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong* tadOnlyOutputShapeInfo, Nd4jLong *tadOutputOffsets, - cudaStream_t stream), LIBND4J_TYPES); - - -} \ No newline at end of file diff --git a/libnd4j/include/loops/special_kernels.h b/libnd4j/include/loops/special_kernels.h index 37356efcd..df7c0c617 100644 --- a/libnd4j/include/loops/special_kernels.h +++ b/libnd4j/include/loops/special_kernels.h @@ -89,17 +89,6 @@ namespace nd4j { template _CUDA_H void tileKernelHH(void const* inputBuffer, Nd4jLong* inputShape, void* outputBuffer, Nd4jLong* outputShape, Nd4jLong resultLength, Nd4jLong ews, cudaStream_t *stream); - template - _CUDA_H void repeatKernelH(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, Nd4jLong outputLength, - Nd4jLong *tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong *tadOnlyOutputShapeInfo,Nd4jLong *tadOutputOffsets, - cudaStream_t stream); - - template - _CUDA_H void repeatKernelHH(void const* inputBuffer, void* outputBuffer, Nd4jLong numTads, Nd4jLong inputLength, - Nd4jLong *tadOnlyInputShapeInfo, Nd4jLong *tadInputOffsets, - Nd4jLong *tadOnlyOutputShapeInfo,Nd4jLong *tadOutputOffsets, - cudaStream_t stream); class NDArray; template diff --git a/libnd4j/include/ops/declarable/generic/transforms/repeat.cpp b/libnd4j/include/ops/declarable/generic/transforms/repeat.cpp index 1a1ecf96e..f3e7b84a5 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/repeat.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/repeat.cpp @@ -15,7 +15,7 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -24,38 +24,52 @@ #include namespace nd4j { - namespace ops { - ////////////////////////////////////////////////////////////////////////// - // here iArgs is int vector of repeats at the beginning and last element in iArgs is dimension - CUSTOM_OP_IMPL(repeat, 1, 1, true, 0, -1) { +namespace ops { - auto x = INPUT_VARIABLE(0); - auto ret = OUTPUT_VARIABLE(0); 
+////////////////////////////////////////////////////////////////////////// +// here iArgs is int vector of repeats at the beginning and last element in iArgs is dimension +CUSTOM_OP_IMPL(repeat, 1, 1, true, 0, -1) { - x->repeat(block.getIArguments()->back(), *ret); + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); - return Status::OK(); - } + std::vector repeats = *block.getIArguments(); - DECLARE_TYPES(repeat) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setSameMode(true); - } - - DECLARE_SHAPE_FN(repeat) { - - auto x = INPUT_VARIABLE(0); - auto argumets = block.getIArguments(); - int argsSize = argumets->size(); - int dimension = (*argumets)[argsSize-1]; - auto repeats = *argumets; - repeats.pop_back(); - - auto outShape = ShapeUtils::evalRepeatShape(dimension, ArrayUtils::toLongVector(repeats), *x); - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(x->dataType(), x->ordering(), outShape))); - } - } + const int axis = repeats.back() < 0 ? repeats.back() + input->rankOf() : repeats.back(); + + repeats.pop_back(); + + REQUIRE_TRUE(0 <= axis && axis < input->rankOf(), 0, "CUSTOM REPEAT OP: wrong axis argument it should be less then input array rank %i, but got %i instead !", input->rankOf(), axis); + + REQUIRE_TRUE(repeats.size() == 1 || repeats.size() == input->sizeAt(axis), 0, "CUSTOM REPEAT OP: wrong axis argument, size of repeats vector must be 1 or equal to dimension at given axis, but got repeats.size = %i and axis = %i !", repeats.size(), axis); + + input->repeat(axis, repeats, *output); + + return Status::OK(); +} + +DECLARE_TYPES(repeat) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setSameMode(true); +} + +DECLARE_SHAPE_FN(repeat) { + + auto input = INPUT_VARIABLE(0); + + std::vector repeats = *block.getIArguments(); + + const int axis = repeats.back() < 0 ? 
repeats.back() + input->rankOf() : repeats.back(); + + repeats.pop_back(); + + auto outShape = ShapeUtils::evalRepeatShape(axis, repeats, *input); + + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(input->dataType(), input->ordering(), outShape))); + +} +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 9b96f34d2..a92e6713b 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -1263,12 +1263,16 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c } } - void tileBP(nd4j::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps) { - BUILD_SINGLE_SELECTOR(gradI.dataType(), tileBP_, (gradO, gradI, reps), FLOAT_TYPES); - } +void tileBP(nd4j::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps) { + BUILD_SINGLE_SELECTOR(gradI.dataType(), tileBP_, (gradO, gradI, reps), FLOAT_TYPES); +} + + +BUILD_SINGLE_TEMPLATE(template void tileBP_, (const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps), FLOAT_TYPES); + + - BUILD_SINGLE_TEMPLATE(template void tileBP_, (const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps), FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/transforms.h b/libnd4j/include/ops/declarable/helpers/transforms.h index 1781c2f21..82ff4a73e 100644 --- a/libnd4j/include/ops/declarable/helpers/transforms.h +++ b/libnd4j/include/ops/declarable/helpers/transforms.h @@ -73,7 +73,6 @@ namespace helpers { void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis); void tileBP(nd4j::LaunchContext * context, const NDArray& gradO /*input*/, NDArray& gradI /*output*/, const std::vector reps); - } } } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index b731a37b4..04f9a7570 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -1782,40 +1782,6 @@ TEST_F(DeclarableOpsTests1, Reshape7){ } -////////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests1, Repeat1) { - - float eBuffer[8] = {1.0,2.0,1.0,2.0,3.0,4.0,3.0,4.0}; - Nd4jLong eShape[8] = {2, 4, 2, 2, 1, 0, 1, 99}; - ArrayOptions::setDataType(eShape, nd4j::DataType::FLOAT32); - auto x = NDArrayFactory::create_('c', {2, 2}); - auto exp = new NDArray(eBuffer, eShape); - for (int e = 0; e < x->lengthOf(); e++) - x->p(e, e + 1); - - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, x); - variableSpace->putVariable(1, new Variable()); - - auto block = new Context(1, variableSpace, false); - block->fillInputs({-1}); - std::vector* arguments = block->getIArguments(); - *arguments = {2}; // set repeats - arguments->push_back(0); // set dimension - - nd4j::ops::repeat repeat; - - Nd4jStatus status = repeat.execute(block); - ASSERT_EQ(ND4J_STATUS_OK, status); - auto result = variableSpace->getVariable(block->getNodeId())->getNDArray(); - - ASSERT_TRUE(exp->equalsTo(result)); - - delete exp; - delete block; - delete variableSpace; -} - ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests1, Transpose1) { 
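////////////////////////////////////////////////////////////////////////
// Minimal usage sketch (illustrative, not taken from the patch hunks): how the
// new integer-argument layout is consumed — the repeats come first in IArgs and
// the axis is the last IArg, matching the parsing in CUSTOM_OP_IMPL(repeat, ...)
// above. The array values below are placeholders.
static void repeatOpUsageSketch() {
    NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6});

    nd4j::ops::repeat op;

    // single repeat value 2 applied along axis 0 -> output shape {4, 3}
    auto r1 = op.execute({&x}, {}, {2, 0});

    // per-element repeats {1, 2, 3} along axis 1 -> output shape {2, 6}
    auto r2 = op.execute({&x}, {}, {1, 2, 3, 1});

    // the NDArray member call does the same without the op wrapper
    auto r3 = x.repeat(0, {2});    // returns a new NDArray*

    delete r1;
    delete r2;
    delete r3;
}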
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp index 861bb283a..2d8311828 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests14.cpp @@ -450,3 +450,93 @@ TEST_F(DeclarableOpsTests14, test_empty_tanh_5) { delete result; } + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, repeat_1) { + + NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6}); + NDArray e('c', {4, 3}, {1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6}); + + nd4j::ops::repeat op; + auto result = op.execute({&x}, {}, {2, 0}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(e.isSameShape(z)); + ASSERT_TRUE(e.equalsTo(z)); + + delete result; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, repeat_2) { + + NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6}); + NDArray e('c', {2, 6}, {1, 1, 2, 2, 3, 3,4, 4, 5, 5, 6, 6}); + + nd4j::ops::repeat op; + auto result = op.execute({&x}, {}, {2, 1}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(e.isSameShape(z)); + ASSERT_TRUE(e.equalsTo(z)); + + delete result; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, repeat_3) { + + NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6}); + NDArray e('c', {2, 6}, {1, 2, 2, 3, 3, 3,4, 5, 5, 6, 6, 6}); + + nd4j::ops::repeat op; + auto result = op.execute({&x}, {}, {1,2,3, 1}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(e.isSameShape(z)); + ASSERT_TRUE(e.equalsTo(z)); + + delete result; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, repeat_4) { + + NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6}); + NDArray e('c', {7, 3}, {1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6, 4, 5, 6, 4, 5, 6}); + + nd4j::ops::repeat op; + auto result = op.execute({&x}, {}, {3,4, 0}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(e.isSameShape(z)); + ASSERT_TRUE(e.equalsTo(z)); + + delete result; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests14, repeat_5) { + + NDArray x('c', {2, 3, 4}, {1, 2, 3, 4, 5, 6, 7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24}); + NDArray e('c', {2, 4, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 17, 18, 19, 20, 21, 22, 23, 24}); + + nd4j::ops::repeat op; + auto result = op.execute({&x}, {}, {1,2,1, 1}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + ASSERT_TRUE(e.isSameShape(z)); + ASSERT_TRUE(e.equalsTo(z)); + + delete result; +} diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp index 523021e0d..0bed375cc 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp @@ -39,20 +39,6 @@ public: } }; -TEST_F(DeclarableOpsTests16, test_repeat_119) { - auto x = NDArrayFactory::create('c', {2, 3}, {1, 2, 3, 4, 5, 6}); - auto e = NDArrayFactory::create('c', {4, 3}, {1, 2, 3, 1, 2, 3, 4, 5, 6, 4, 5, 6}); - - nd4j::ops::repeat op; - auto result = op.execute({&x}, {}, {2, 0}); - ASSERT_EQ(Status::OK(), result->status()); - - auto z = result->at(0); - ASSERT_EQ(e, *z); - - delete result; -} - 
TEST_F(DeclarableOpsTests16, test_scatter_update_119) { auto x = NDArrayFactory::create('c', {3}, {1, 1, 1}); auto y = NDArrayFactory::create(0); diff --git a/libnd4j/tests_cpu/layers_tests/MultiDataTypeTests.cpp b/libnd4j/tests_cpu/layers_tests/MultiDataTypeTests.cpp index 18b849d53..d271048a9 100644 --- a/libnd4j/tests_cpu/layers_tests/MultiDataTypeTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/MultiDataTypeTests.cpp @@ -196,7 +196,7 @@ TEST_F(MultiDataTypeTests, ndarray_repeat_test1) { NDArray y('c', {2, 4}, nd4j::DataType::HALF); NDArray exp('c', {2, 4}, {0.5, 0.5, 1.5, 1.5, 2.5, 2.5, 3.5, 3.5}, nd4j::DataType::HALF); - x.repeat(1, y); + x.repeat(1, {2}, y); ASSERT_EQ(y, exp); } diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp index 9af4a6bb3..75608f2bc 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests.cpp @@ -324,7 +324,7 @@ TEST_F(NDArrayTest, TestRepeat2) { auto rep = exp->dup(); rep->assign(0.); - array->repeat(0, *rep); + array->repeat(0, {2}, *rep); //rep->printIndexedBuffer("Repeated"); ASSERT_EQ(4, rep->sizeAt(0));
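////////////////////////////////////////////////////////////////////////
// Sketch of the target-filling overload introduced by this patch (illustrative
// values, assuming the signatures shown above): the caller allocates a target
// whose shape matches ShapeUtils::evalRepeatShape(axis, repeats, x); otherwise
// NDArray::repeat throws std::invalid_argument.
static void repeatIntoTargetSketch() {
    NDArray x('c', {2, 3}, {1, 2, 3, 4, 5, 6});

    // repeats {2, 1, 1} along axis 1 -> evalRepeatShape gives {2, 4}
    NDArray target('c', {2, 4}, x.dataType());

    x.repeat(1, {2, 1, 1}, target);
}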