From d9ab299759ee7d2f6bd3db7c65ff75d37274ec7d Mon Sep 17 00:00:00 2001
From: raver119
Date: Wed, 21 Aug 2019 15:05:47 +0300
Subject: [PATCH] [WIP] Minor fixes (#140)

* - Tile java shape fn removed
- Tile 0 validation added
- scatter_upd test

Signed-off-by: raver119

* additional tile validation

Signed-off-by: raver119

* - provide vector case in cuda scatter op

Signed-off-by: Yurii

* cpu ismax view fix

Signed-off-by: raver119

* exp

Signed-off-by: raver119

* cuda ismax fix

Signed-off-by: raver119
---
 libnd4j/blas/cpu/NDArray.cpp                  |  7 +++-
 libnd4j/blas/cuda/NDArray.cu                  |  7 +++-
 .../cuda/specials/fillDimensionalIsMax.cu     |  8 ++---
 .../declarable/generic/transforms/tile.cpp    |  8 ++++-
 .../ops/declarable/helpers/cpu/ismax.cpp      | 23 +++++++++++--
 .../ops/declarable/helpers/cuda/ismax.cu      |  2 +-
 .../ops/declarable/helpers/cuda/scatter.cu    |  2 +-
 .../layers_tests/DeclarableOpsTests16.cpp     | 17 ++++++++++
 .../layers_tests/JavaInteropTests.cpp         | 28 ++++++++++++++++
 .../nd4j/linalg/api/ops/impl/shape/Tile.java  | 32 -------------------
 10 files changed, 89 insertions(+), 45 deletions(-)

diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp
index 2a843f956..d6c82ea86 100644
--- a/libnd4j/blas/cpu/NDArray.cpp
+++ b/libnd4j/blas/cpu/NDArray.cpp
@@ -231,7 +231,8 @@ void* NDArray::getSpecialBuffer() const {
 // change an array by repeating it the number of times given by reps.
 NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
     const int repsSize = reps.size();
-    int product = 1;
+
+    Nd4jLong product = 1;
     for(const auto& item : reps)
         product *= item;
     if(product == 0)
@@ -286,6 +287,10 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
 
 // change an array by repeating it the number of times given by reps.
 void NDArray::tile(const std::vector<Nd4jLong>& reps, NDArray& target) const {
+    auto repProd = shape::prodLong(reps.data(), reps.size());
+    if (repProd < 1)
+        throw std::runtime_error("NDArray::tile: reps can't contain 0s");
+
     // evaluate true tile shapeInfo for comparison with target shapeInfo
     auto newShapeInfo = ShapeUtils::evalTileShapeInfo(*this, reps, getContext()->getWorkspace());
     if(!shape::equalsSoft(newShapeInfo, target.getShapeInfo())) {
diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/blas/cuda/NDArray.cu
index 38b6599bf..e80e6dddd 100644
--- a/libnd4j/blas/cuda/NDArray.cu
+++ b/libnd4j/blas/cuda/NDArray.cu
@@ -312,7 +312,8 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
     Nd4jLong product = 1;
     for(const auto& item : reps)
         product *= item;
-    if(product == 0)
+
+    if(product < 1)
         throw std::runtime_error("NDArray::tile method: one of the elements in reps array is zero !");
 
     int rankOld = rankOf();
@@ -351,6 +352,10 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
 
 // change an array by repeating it the number of times given by reps.
 void NDArray::tile(const std::vector<Nd4jLong>& reps, NDArray& target) const {
+    auto repProd = shape::prodLong(reps.data(), reps.size());
+    if (repProd < 1)
+        throw std::runtime_error("NDArray::tile: reps can't contain 0s");
+
     // evaluate true tile shapeInfo for comparison with target shapeInfo
     auto newShapeInfo = ShapeUtils::evalTileShapeInfo(*this, reps, getContext()->getWorkspace());
     if(!shape::equalsSoft(newShapeInfo, target.getShapeInfo())) {
diff --git a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu
index abcc3fb7c..70da24715 100644
--- a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu
+++ b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu
@@ -48,18 +48,16 @@ namespace nd4j {
 
         for (int r = blockIdx.x; r < numTads; r += gridDim.x) {
             auto tadOffsetForBlock = tadOffsets[r];
-
-            int highestElement = (int) dX[r];
+            auto highestElement = dX[r];
 
             if (dimensionLength > 1 || tadEWS < 1) {
-                for (int e = threadIdx.x; e < tadLength; e += blockDim.x) {
-
+                for (Nd4jLong e = threadIdx.x; e < tadLength; e += blockDim.x) {
                     auto xOffset = tadOffsetForBlock + shape::getIndexOffset(e, tadOnlyShapeInfo, tadLength);
                     dZ[xOffset] = (e == highestElement ? (T) 1 : (T) 0);
                 }
             } else {
-                for (int e = threadIdx.x; e < tadLength; e += blockDim.x) {
+                for (Nd4jLong e = threadIdx.x; e < tadLength; e += blockDim.x) {
                     // so, we just set dZ[e] for each TAD. Sure, e should be replaced with
                     auto idx = tadOffsetForBlock + (e * tadEWS);
                     dZ[idx] = (e == highestElement ? (T) 1 : (T) 0);
diff --git a/libnd4j/include/ops/declarable/generic/transforms/tile.cpp b/libnd4j/include/ops/declarable/generic/transforms/tile.cpp
index bcc08eee4..8ef1032d5 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/tile.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/tile.cpp
@@ -50,6 +50,9 @@ CUSTOM_OP_IMPL(tile, 1, 1, false, 0, -2) {
     else {
         REQUIRE_TRUE(false, 0, "TILE op: this op requires repeats vector, either as IArgs or second array with length equal to rank of input array to be tiled !");
     }
+
+    auto repProd = shape::prodLong(reps.data(), reps.size());
+    REQUIRE_TRUE(repProd > 0, 0, "TILE op: reps can't contain 0s");
 
     input->tile(reps, *output);
 
@@ -81,7 +84,10 @@ DECLARE_SHAPE_FN(tile) {
     }
     else {
         REQUIRE_TRUE(false, 0, "TILE op: this op requires repeats vector, either as IArgs or second array with length equal to rank of input array to be tiled !");
-    } 
+    }
+
+    auto repProd = shape::prodLong(reps.data(), reps.size());
+    REQUIRE_TRUE(repProd > 0, 0, "TILE op: reps can't contain 0s");
 
     std::vector<Nd4jLong> shape(inRank);
     for (int e = 0; e < shape::rank(inShape); e++)
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
index 330e23d5b..0a4ee2fd7 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
@@ -125,9 +125,12 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
         //to the back.
         //permuted version of the input shape info for setting up the tad problem
         auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), const_cast<int*>(dimensions.data()), dimensionsLength);
+        auto tadPackZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), const_cast<int*>(dimensions.data()), dimensionsLength);
+
         auto tadShapeShapeInfo = tadPack.primaryShapeInfo();
         auto tadOffsets = tadPack.primaryOffsets();
+        auto zOfsets = tadPackZ.platformOffsets();
 
         int tadLength = shape::length(tadShapeShapeInfo);
         int tads = tadPack.numberOfTads();
 
@@ -137,7 +140,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
         num_threads = nd4j::math::nd4j_min<int>(num_threads, omp_get_max_threads());
 
         auto tadEWS = shape::elementWiseStride(tadShapeShapeInfo);
-        auto zEWS = tadEWS;
+        auto zEWS = shape::elementWiseStride(tadPackZ.primaryShapeInfo());
 
         int span = (tads / num_threads) + 8;
 
@@ -151,7 +154,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
             for (int r = start; r < end; r++) {
                 if (tadEWS > 0 && zEWS > 0 && dimensionsLength == 1) {
                     auto rX = const_cast<NDArray*>(input)->bufferAsT<X>() + tadOffsets[r];
-                    auto rZ = output->bufferAsT<Z>() + tadOffsets[r];
+                    auto rZ = output->bufferAsT<Z>() + zOfsets[r];
 
                     auto maxValue = rX[0];
                     int maxIdx = 0;
@@ -168,7 +171,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
                             rZ[i] = maxIdx == i ? (Z) 1 : (Z) 0;
                         }
                     }
-                    else {
+                    else if (tadEWS > 1 && zEWS > 1) {
                         for (int i = 0; i < tadLength; i++) {
                             if (rX[i * tadEWS] > maxValue) {
                                 maxIdx = i;
@@ -180,6 +183,20 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
                         for (int i = 0; i < tadLength; i++) {
                             rZ[i * zEWS] = maxIdx == i ? (Z) 1 : (Z) 0;
                         }
+                    } else {
+                        for (int i = 0; i < tadLength; i++) {
+                            auto xOffset = shape::getIndexOffset(i, tadShapeShapeInfo, tadLength);
+                            if (rX[xOffset] > maxValue) {
+                                maxIdx = i;
+                                maxValue = rX[xOffset];
+                            }
+                        }
+
+                        PRAGMA_OMP_SIMD
+                        for (int i = 0; i < tadLength; i++) {
+                            auto zOffset = shape::getIndexOffset(i, tadPackZ.primaryShapeInfo(), tadLength);
+                            rZ[zOffset] = maxIdx == i ? (Z) 1 : (Z) 0;
+                        }
                     }
                 }
                 else {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu b/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu
index 26ed9780c..bc7ea1caa 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu
@@ -62,7 +62,7 @@ static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray* output,
     int dimensionLength = dimensions.size();
     std::vector<int> copy(dimensions);
 
-    auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), copy.data(), copy.size());
+    auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size());
 
     auto indexMaxArr = input->applyIndexReduce(indexreduce::IndexMax, dimensions);
 
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
index 43480f75d..ec0d304df 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@@ -183,7 +183,7 @@ __global__ static void scatterLockCuda(const int opCode,
 
     __shared__ bool vectorCase;
     if(threadIdx.x == 0)
-        vectorCase = yTadLen == xLen && shape::rank(xShapeInfo) == 1;
+        vectorCase = yTadLen == xLen && shape::rank(xShapeInfo) <= 1;
     __syncthreads();
 
     for (int e = 0; e < xLen; e++) {
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
index 5b17b684a..523021e0d 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
@@ -52,3 +52,20 @@ TEST_F(DeclarableOpsTests16, test_repeat_119) {
 
     delete result;
 }
+
+TEST_F(DeclarableOpsTests16, test_scatter_update_119) {
+    auto x = NDArrayFactory::create<float>('c', {3}, {1, 1, 1});
+    auto y = NDArrayFactory::create<int>(0);
+    auto w = NDArrayFactory::create<float>(3.0f);
+    auto e = NDArrayFactory::create<float>('c', {3}, {3.f, 1.f, 1.f});
+
+    nd4j::ops::scatter_upd op;
+    auto result = op.execute({&x, &y, &w}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_EQ(e, *z);
+
+    delete result;
+}
\ No newline at end of file
diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
index 43c6c45df..1661bd99d 100644
--- a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
@@ -1161,6 +1161,34 @@ TEST_F(JavaInteropTests, test_bfloat16_rng) {
     ASSERT_TRUE(z.sumNumber().e<float>(0) > 0);
 }
 
+TEST_F(JavaInteropTests, test_ismax_view) {
+    auto original = NDArrayFactory::create<double>('c', {2, 3, 40});
+    auto v = original.subarray({NDIndex::all(), NDIndex::all(), NDIndex::interval(0, 40, 2)});
+    v->assign(1.0);
+
+    auto e = v->ulike();
+    auto t = e.tensorAlongDimension(0, {0, 1});
+    t->assign(1.0);
+
+    auto z = v->ulike();
+
+
+    Nd4jLong iArgs[] = {2L, 0L};
+    Context ctx(1);
+    ctx.setInputArray(0, v->buffer(), v->shapeInfo(), v->specialBuffer(), v->specialShapeInfo());
+    ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo());
+    ctx.setIArguments(iArgs, 1);
+
+    nd4j::ops::ismax op;
+    op.execute(&ctx);
+
+    z.printIndexedBuffer("z");
+    ASSERT_EQ(e, z);
+
+    delete v;
+    delete t;
+}
+
 /*
 TEST_F(JavaInteropTests, Test_Results_Conversion_1) {
     auto pl = nd4j::graph::readFlatBuffers("./resources/gru_dynamic_mnist.fb");
diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/shape/Tile.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/shape/Tile.java
index 81187d5ad..996c70b97 100644
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/shape/Tile.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/shape/Tile.java
@@ -103,38 +103,6 @@ public class Tile extends DynamicCustomOp {
         return ret;
     }
 
-    @Override
-    public List<LongShapeDescriptor> calculateOutputShape() {
-        if(inputArguments.size() == 0)
-            return Collections.emptyList();
-
-        /**
-         * This op is special case: we can't infer its shape before both inputs are available.
-         * So if reps argument is full of 0.0s - we skip shape inference
-         *
-         * And during actual op invocation both inputs should be available due to topo sort
-         */
-        if (is_static_reps)
-            return Nd4j.getExecutioner().calculateOutputShape(this);
-
-        if (inputArguments().length < 2)
-            return Collections.emptyList();
-
-        val array = inputArguments()[1];
-
-        // FIXME: int cast
-        val reps = new long[(int) array.length()];
-
-        for (int e = 0; e < reps.length; e++)
-            reps[e] = (int) array.getDouble(e);
-
-        if (ArrayUtil.prodLong(reps) == 0)
-            return Collections.emptyList();
-        else
-            return Nd4j.getExecutioner().calculateOutputShape(this);
-    }
-
-
     @Override
     public String opName() {
         return "tile";