From d9ab299759ee7d2f6bd3db7c65ff75d37274ec7d Mon Sep 17 00:00:00 2001
From: raver119
Date: Wed, 21 Aug 2019 15:05:47 +0300
Subject: [PATCH] [WIP] Minor fixes (#140)

* - Tile java shape fn removed
- Tile 0 validation added
- scatter_upd test

Signed-off-by: raver119

* additional tile validation

Signed-off-by: raver119

* - provide vector case in cuda scatter op

Signed-off-by: Yurii

* cpu ismax view fix

Signed-off-by: raver119

* exp

Signed-off-by: raver119

* cuda ismax fix

Signed-off-by: raver119
---
 libnd4j/blas/cpu/NDArray.cpp                  |  7 +++-
 libnd4j/blas/cuda/NDArray.cu                  |  7 +++-
 .../cuda/specials/fillDimensionalIsMax.cu     |  8 ++---
 .../declarable/generic/transforms/tile.cpp    |  8 ++++-
 .../ops/declarable/helpers/cpu/ismax.cpp      | 23 +++++++++++--
 .../ops/declarable/helpers/cuda/ismax.cu      |  2 +-
 .../ops/declarable/helpers/cuda/scatter.cu    |  2 +-
 .../layers_tests/DeclarableOpsTests16.cpp     | 17 ++++++++++
 .../layers_tests/JavaInteropTests.cpp         | 28 ++++++++++++++++
 .../nd4j/linalg/api/ops/impl/shape/Tile.java  | 32 -------------------
 10 files changed, 89 insertions(+), 45 deletions(-)

diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp
index 2a843f956..d6c82ea86 100644
--- a/libnd4j/blas/cpu/NDArray.cpp
+++ b/libnd4j/blas/cpu/NDArray.cpp
@@ -231,7 +231,8 @@ void* NDArray::getSpecialBuffer() const {
 // change an array by repeating it the number of times given by reps.
 NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
     const int repsSize = reps.size();
-    int product = 1;
+
+    Nd4jLong product = 1;
     for(const auto& item : reps)
         product *= item;
     if(product == 0)
@@ -286,6 +287,10 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
 
 // change an array by repeating it the number of times given by reps.
 void NDArray::tile(const std::vector<Nd4jLong>& reps, NDArray& target) const {
+    auto repProd = shape::prodLong(reps.data(), reps.size());
+    if (repProd < 1)
+        throw std::runtime_error("NDArray::tile: reps can't contain 0s");
+
     // evaluate true tile shapeInfo for comparison with target shapeInfo
     auto newShapeInfo = ShapeUtils::evalTileShapeInfo(*this, reps, getContext()->getWorkspace());
     if(!shape::equalsSoft(newShapeInfo, target.getShapeInfo())) {
diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/blas/cuda/NDArray.cu
index 38b6599bf..e80e6dddd 100644
--- a/libnd4j/blas/cuda/NDArray.cu
+++ b/libnd4j/blas/cuda/NDArray.cu
@@ -312,7 +312,8 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
     Nd4jLong product = 1;
     for(const auto& item : reps)
         product *= item;
-    if(product == 0)
+
+    if(product < 1)
         throw std::runtime_error("NDArray::tile method: one of the elements in reps array is zero !");
 
     int rankOld = rankOf();
@@ -351,6 +352,10 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
 
 // change an array by repeating it the number of times given by reps.
 void NDArray::tile(const std::vector<Nd4jLong>& reps, NDArray& target) const {
+    auto repProd = shape::prodLong(reps.data(), reps.size());
+    if (repProd < 1)
+        throw std::runtime_error("NDArray::tile: reps can't contain 0s");
+
     // evaluate true tile shapeInfo for comparison with target shapeInfo
     auto newShapeInfo = ShapeUtils::evalTileShapeInfo(*this, reps, getContext()->getWorkspace());
     if(!shape::equalsSoft(newShapeInfo, target.getShapeInfo())) {
diff --git a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu
index abcc3fb7c..70da24715 100644
--- a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu
+++ b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu
@@ -48,18 +48,16 @@ namespace nd4j {
 
         for (int r = blockIdx.x; r < numTads; r += gridDim.x) {
             auto tadOffsetForBlock = tadOffsets[r];
-
-            int highestElement = (int) dX[r];
+            auto highestElement = dX[r];
 
             if (dimensionLength > 1 || tadEWS < 1) {
-                for (int e = threadIdx.x; e < tadLength; e += blockDim.x) {
-
+                for (Nd4jLong e = threadIdx.x; e < tadLength; e += blockDim.x) {
                     auto xOffset = tadOffsetForBlock + shape::getIndexOffset(e, tadOnlyShapeInfo, tadLength);
                     dZ[xOffset] = (e == highestElement ? (T) 1 : (T) 0);
                 }
             } else {
-                for (int e = threadIdx.x; e < tadLength; e += blockDim.x) {
+                for (Nd4jLong e = threadIdx.x; e < tadLength; e += blockDim.x) {
                     // so, we just set dZ[e] for each TAD. Sure, e should be replaced with
                     auto idx = tadOffsetForBlock + (e * tadEWS);
                     dZ[idx] = (e == highestElement ? (T) 1 : (T) 0);
diff --git a/libnd4j/include/ops/declarable/generic/transforms/tile.cpp b/libnd4j/include/ops/declarable/generic/transforms/tile.cpp
index bcc08eee4..8ef1032d5 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/tile.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/tile.cpp
@@ -50,6 +50,9 @@ CUSTOM_OP_IMPL(tile, 1, 1, false, 0, -2) {
     else {
         REQUIRE_TRUE(false, 0, "TILE op: this op requires repeats vector, either as IArgs or second array with length equal to rank of input array to be tiled !");
     }
+
+    auto repProd = shape::prodLong(reps.data(), reps.size());
+    REQUIRE_TRUE(repProd > 0, 0, "TILE op: reps can't contain 0s");
 
     input->tile(reps, *output);
 
@@ -81,7 +84,10 @@ DECLARE_SHAPE_FN(tile) {
     }
     else {
         REQUIRE_TRUE(false, 0, "TILE op: this op requires repeats vector, either as IArgs or second array with length equal to rank of input array to be tiled !");
-    } 
+    }
+
+    auto repProd = shape::prodLong(reps.data(), reps.size());
+    REQUIRE_TRUE(repProd > 0, 0, "TILE op: reps can't contain 0s");
 
     std::vector<Nd4jLong> shape(inRank);
     for (int e = 0; e < shape::rank(inShape); e++)
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
index 330e23d5b..0a4ee2fd7 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
@@ -125,9 +125,12 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
         //to the back.
         //permuted version of the input shape info for setting up the tad problem
         auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), const_cast<int*>(dimensions.data()), dimensionsLength);
+        auto tadPackZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), const_cast<int*>(dimensions.data()), dimensionsLength);
+
         auto tadShapeShapeInfo = tadPack.primaryShapeInfo();
         auto tadOffsets = tadPack.primaryOffsets();
+        auto zOfsets = tadPackZ.platformOffsets();
 
         int tadLength = shape::length(tadShapeShapeInfo);
         int tads = tadPack.numberOfTads();
 
@@ -137,7 +140,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
         num_threads = nd4j::math::nd4j_min<int>(num_threads, omp_get_max_threads());
 
         auto tadEWS = shape::elementWiseStride(tadShapeShapeInfo);
-        auto zEWS = tadEWS;
+        auto zEWS = shape::elementWiseStride(tadPackZ.primaryShapeInfo());
 
         int span = (tads / num_threads) + 8;
 
@@ -151,7 +154,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
             for (int r = start; r < end; r++) {
                 if (tadEWS > 0 && zEWS > 0 && dimensionsLength == 1) {
                     auto rX = const_cast<NDArray*>(input)->bufferAsT<X>() + tadOffsets[r];
-                    auto rZ = output->bufferAsT<Z>() + tadOffsets[r];
+                    auto rZ = output->bufferAsT<Z>() + zOfsets[r];
 
                     auto maxValue = rX[0];
                     int maxIdx = 0;
@@ -168,7 +171,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
                             rZ[i] = maxIdx == i ? (Z) 1 : (Z) 0;
                         }
                     }
-                    else {
+                    else if (tadEWS > 1 && zEWS > 1) {
                         for (int i = 0; i < tadLength; i++) {
                             if (rX[i * tadEWS] > maxValue) {
                                 maxIdx = i;
@@ -180,6 +183,20 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
                         for (int i = 0; i < tadLength; i++) {
                             rZ[i * zEWS] = maxIdx == i ? (Z) 1 : (Z) 0;
                         }
+                    } else {
+                        for (int i = 0; i < tadLength; i++) {
+                            auto xOffset = shape::getIndexOffset(i, tadShapeShapeInfo, tadLength);
+                            if (rX[xOffset] > maxValue) {
+                                maxIdx = i;
+                                maxValue = rX[xOffset];
+                            }
+                        }
+
+                        PRAGMA_OMP_SIMD
+                        for (int i = 0; i < tadLength; i++) {
+                            auto zOffset = shape::getIndexOffset(i, tadPackZ.primaryShapeInfo(), tadLength);
+                            rZ[zOffset] = maxIdx == i ? (Z) 1 : (Z) 0;
+                        }
                     }
                 }
                 else {
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu b/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu
index 26ed9780c..bc7ea1caa 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/ismax.cu
@@ -62,7 +62,7 @@ static void ismax_(nd4j::LaunchContext * context, const NDArray* input, NDArray* output,
     int dimensionLength = dimensions.size();
     std::vector<int> copy(dimensions);
 
-    auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), copy.data(), copy.size());
+    auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), copy.data(), copy.size());
 
     auto indexMaxArr = input->applyIndexReduce(indexreduce::IndexMax, dimensions);
 
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
index 43480f75d..ec0d304df 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@@ -183,7 +183,7 @@ __global__ static void scatterLockCuda(const int opCode,
 
     __shared__ bool vectorCase;
     if(threadIdx.x == 0)
-        vectorCase = yTadLen == xLen && shape::rank(xShapeInfo) == 1;
+        vectorCase = yTadLen == xLen && shape::rank(xShapeInfo) <= 1;
     __syncthreads();
 
     for (int e = 0; e < xLen; e++) {
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
index 5b17b684a..523021e0d 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
@@ -52,3 +52,20 @@ TEST_F(DeclarableOpsTests16, test_repeat_119) {
 
     delete result;
 }
+
+TEST_F(DeclarableOpsTests16, test_scatter_update_119) {
+    auto x = NDArrayFactory::create<float>('c', {3}, {1, 1, 1});
+    auto y = NDArrayFactory::create<int>(0);
+    auto w = NDArrayFactory::create<float>(3.0f);
+    auto e = NDArrayFactory::create<float>('c', {3}, {3.f, 1.f, 1.f});
+
+    nd4j::ops::scatter_upd op;
+    auto result = op.execute({&x, &y, &w}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_EQ(e, *z);
+
+    delete result;
+}
\ No newline at end of file
diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
index 43c6c45df..1661bd99d 100644
--- a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
@@ -1161,6 +1161,34 @@ TEST_F(JavaInteropTests, test_bfloat16_rng) {
     ASSERT_TRUE(z.sumNumber().e<float>(0) > 0);
 }
 
+TEST_F(JavaInteropTests, test_ismax_view) {
+    auto original = NDArrayFactory::create<double>('c', {2, 3, 40});
+    auto v = original.subarray({NDIndex::all(), NDIndex::all(), NDIndex::interval(0, 40, 2)});
+    v->assign(1.0);
+
+    auto e = v->ulike();
+    auto t = e.tensorAlongDimension(0, {0, 1});
+    t->assign(1.0);
+
+    auto z = v->ulike();
+
+
+    Nd4jLong iArgs[] = {2L, 0L};
+    Context ctx(1);
+    ctx.setInputArray(0, v->buffer(), v->shapeInfo(), v->specialBuffer(), v->specialShapeInfo());
+    ctx.setOutputArray(0, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo());
+    ctx.setIArguments(iArgs, 1);
+
+    nd4j::ops::ismax op;
+    op.execute(&ctx);
+
+    z.printIndexedBuffer("z");
+    ASSERT_EQ(e, z);
+
+    delete v;
+    delete t;
+}
+
 /*
 TEST_F(JavaInteropTests, Test_Results_Conversion_1) {
     auto pl = nd4j::graph::readFlatBuffers("./resources/gru_dynamic_mnist.fb");
diff --git a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/shape/Tile.java b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/shape/Tile.java
index 81187d5ad..996c70b97 100644
--- a/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/shape/Tile.java
+++ b/nd4j/nd4j-backends/nd4j-api-parent/nd4j-api/src/main/java/org/nd4j/linalg/api/ops/impl/shape/Tile.java
@@ -103,38 +103,6 @@ public class Tile extends DynamicCustomOp {
         return ret;
     }
 
-    @Override
-    public List<LongShapeDescriptor> calculateOutputShape() {
-        if(inputArguments.size() == 0)
-            return Collections.emptyList();
-
-        /**
-         * This op is special case: we can't infer its shape before both inputs are available.
-         * So if reps argument is full of 0.0s - we skip shape inference
-         *
-         * And during actual op invocation both inputs should be available due to topo sort
-         */
-        if (is_static_reps)
-            return Nd4j.getExecutioner().calculateOutputShape(this);
-
-        if (inputArguments().length < 2)
-            return Collections.emptyList();
-
-        val array = inputArguments()[1];
-
-        // FIXME: int cast
-        val reps = new long[(int) array.length()];
-
-        for (int e = 0; e < reps.length; e++)
-            reps[e] = (int) array.getDouble(e);
-
-        if (ArrayUtil.prodLong(reps) == 0)
-            return Collections.emptyList();
-        else
-            return Nd4j.getExecutioner().calculateOutputShape(this);
-    }
-
-
     @Override
     public String opName() {
         return "tile";