From a8dd6713aa64d34c59a3bbcd5710ceb5fb700323 Mon Sep 17 00:00:00 2001
From: Yurii Shyrma <iuriish@yahoo.com>
Date: Tue, 26 Nov 2019 19:29:09 +0200
Subject: [PATCH] Shyrma scatter (#84)

* - improve performance of scatter (no lock) ops for 1D case

Signed-off-by: Yurii <iuriish@yahoo.com>

* - improve scatter lock op performance for 1D case

Signed-off-by: Yurii <iuriish@yahoo.com>

* - add kernel for verification of input indices-array elements in scatter and scatter_nd ops

Signed-off-by: Yurii <iuriish@yahoo.com>

* - provide fast indices checking on cpu side for scatter and gather osp

Signed-off-by: Yurii <iuriish@yahoo.com>

* - apply corrections requested by pr reviewer

Signed-off-by: Yurii <iuriish@yahoo.com>
---
 libnd4j/include/helpers/shape.h               |   19 +-
 .../generic/parity_ops/scatter_add.cpp        |   10 +-
 .../generic/parity_ops/scatter_div.cpp        |   11 +-
 .../generic/parity_ops/scatter_max.cpp        |   10 +-
 .../generic/parity_ops/scatter_min.cpp        |   10 +-
 .../generic/parity_ops/scatter_mul.cpp        |   10 +-
 .../generic/parity_ops/scatter_nd.cpp         |    8 +-
 .../generic/parity_ops/scatter_nd_add.cpp     |   16 +-
 .../generic/parity_ops/scatter_nd_sub.cpp     |   18 +-
 .../generic/parity_ops/scatter_nd_update.cpp  |   16 +-
 .../generic/parity_ops/scatter_sub.cpp        |   10 +-
 .../generic/parity_ops/scatter_upd.cpp        |   10 +-
 .../declarable/generic/transforms/gather.cpp  |   49 +-
 .../generic/transforms/gatherNd.cpp           |    8 +
 .../ops/declarable/helpers/cpu/scatter.cpp    |   43 +
 .../ops/declarable/helpers/cuda/gather.cu     |    6 +-
 .../ops/declarable/helpers/cuda/gather_nd.cu  |    2 +-
 .../ops/declarable/helpers/cuda/scatter.cu    | 1313 +++++++++++------
 .../include/ops/declarable/helpers/scatter.h  |    2 +
 .../layers_tests/DeclarableOpsTests16.cpp     |   10 +
 .../layers_tests/DeclarableOpsTests2.cpp      |   30 +-
 .../layers_tests/DeclarableOpsTests5.cpp      |   50 +-
 .../layers_tests/DeclarableOpsTests6.cpp      |   21 -
 .../tests_cpu/layers_tests/ParityOpsTests.cpp |   74 +-
 24 files changed, 1180 insertions(+), 576 deletions(-)

diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h
index 0a581d718..12162d77c 100644
--- a/libnd4j/include/helpers/shape.h
+++ b/libnd4j/include/helpers/shape.h
@@ -533,7 +533,7 @@ namespace shape {
  * the given shape info buffer
  * represents a scalar shape
  */
-    ND4J_EXPORT _CUDA_HD int isScalar(Nd4jLong *info);
+    ND4J_EXPORT _CUDA_HD int isScalar(const Nd4jLong *info);
 
 /**
  * Returns whether
@@ -904,6 +904,7 @@ namespace shape {
     ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo,  int *coords);
     ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo,  uint *coords);
     ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords);
+    ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, int *coords);
     /**
     * take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
     */
@@ -2706,7 +2707,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn
  * the given shape info buffer
  * represents a scalar shape
  */
-    INLINEDEF _CUDA_HD int isScalar(Nd4jLong *info) {
+    INLINEDEF _CUDA_HD int isScalar(const Nd4jLong *info) {
 
         const int rank = shape::rank(info);
 
@@ -2715,9 +2716,9 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn
         if(rank == 0)
             return 1;
         if(rank == 1)
-            return shape::shapeOf(info)[0] == 1;
+            return shape::shapeOf(const_cast<Nd4jLong*>(info))[0] == 1;
         if(rank == 2)
-            return shape::shapeOf(info)[0] == 1 && shape::shapeOf(info)[1] == 1;
+            return shape::shapeOf(const_cast<Nd4jLong*>(info))[0] == 1 && shape::shapeOf(const_cast<Nd4jLong*>(info))[1] == 1;
 
         return 0;
     }
@@ -4793,6 +4794,16 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jL
     coords[0] = index;      // last iteration
 }
 
+//////////////////////////////////////////////////////////////////////
+INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, int *coords) {
+
+    for(uint i = rank - 1; i > 0; --i) {
+        coords[i] = index % shape[i];
+        index /= shape[i];
+    }
+    coords[0] = index;      // last iteration
+}
+
 //////////////////////////////////////////////////////////////////////
 INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims) {
 
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp
index 0d9465b02..db4eeeff6 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp
@@ -39,6 +39,7 @@ OP_IMPL(scatter_add, 3, 1, true) {
         output->assign(input);
 
     const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+    const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
     const int inRank  = input->rankOf();
     const int indRank = indices->rankOf();
@@ -71,8 +72,15 @@ OP_IMPL(scatter_add, 3, 1, true) {
         REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ADD OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
     }
 
-    if (!indices->isEmpty())
+    if (!indices->isEmpty()) {
+
+        if(checkIndices) {
+            const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output, 0);
+            REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_ADD OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+        }
+
         helpers::scatter(block.launchContext(), pairwise::Add, *indices, *updates, *output, lock);
+    }
 
     return Status::OK();
 }
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_div.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_div.cpp
index dccc34e59..40ddbd424 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_div.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_div.cpp
@@ -39,6 +39,7 @@ namespace nd4j {
                 output->assign(input);
 
             const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+            const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
             const int inRank  = input->rankOf();
             const int indRank = indices->rankOf();
@@ -70,9 +71,15 @@ namespace nd4j {
                 REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_DIV OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
             }
 
-            if (!indices->isEmpty())
-                // ScatterHelper<T>::template scatterApply<simdOps::Divide<T>>(output, indices, updates);
+            if (!indices->isEmpty()) {
+
+                if(checkIndices) {
+                    const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output, 0);
+                    REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_DIV OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+                }
+
                 helpers::scatter(block.launchContext(), pairwise::Divide, *indices, *updates, *output, lock);
+            }
 
             return Status::OK();
         }
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_max.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_max.cpp
index 5d37a71d0..4ec55f088 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_max.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_max.cpp
@@ -39,6 +39,7 @@ OP_IMPL(scatter_max, 3, 1, true) {
         output->assign(input);
 
     const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+    const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
     const int inRank  = input->rankOf();
     const int indRank = indices->rankOf();
@@ -70,8 +71,15 @@ OP_IMPL(scatter_max, 3, 1, true) {
         REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_MAX OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
     }
 
-    if (!indices->isEmpty())
+    if (!indices->isEmpty()) {
+
+        if(checkIndices) {
+            const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output, 0);
+            REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_MAX OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+        }
+
         helpers::scatter(block.launchContext(), pairwise::MaxPairwise, *indices, *updates, *output, lock);
+    }
 
     return Status::OK();
 }
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_min.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_min.cpp
index 1bed296f9..ea8dbf081 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_min.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_min.cpp
@@ -39,6 +39,7 @@ OP_IMPL(scatter_min, 3, 1, true) {
         output->assign(input);
 
     const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+    const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
     const int inRank  = input->rankOf();
     const int indRank = indices->rankOf();
@@ -70,8 +71,15 @@ OP_IMPL(scatter_min, 3, 1, true) {
         REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_MIN OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
     }
 
-    if (!indices->isEmpty())
+    if (!indices->isEmpty()) {
+
+        if(checkIndices) {
+            const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output, 0);
+            REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_MIN OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+        }
+
         helpers::scatter(block.launchContext(), pairwise::MinPairwise, *indices, *updates, *output, lock);
+    }
 
     return Status::OK();
 }
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_mul.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_mul.cpp
index 46b9f7008..4685cef5d 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_mul.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_mul.cpp
@@ -35,6 +35,7 @@ namespace nd4j {
             auto output = OUTPUT_VARIABLE(0);
 
             const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+            const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
             const int inRank  = input->rankOf();
             const int indRank = indices->rankOf();
@@ -70,8 +71,15 @@ namespace nd4j {
                 REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_MUL OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
             }
 
-            if (!indices->isEmpty())
+            if (!indices->isEmpty()) {
+
+                if(checkIndices) {
+                    const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output, 0);
+                    REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_MUL OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+                }
+
                 helpers::scatter(block.launchContext(), pairwise::Multiply, *indices, *updates, *output, lock);
+            }
 
             return Status::OK();
         }
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd.cpp
index de4839d3e..4c3884e07 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd.cpp
@@ -35,6 +35,7 @@ namespace ops {
         auto output = OUTPUT_VARIABLE(0);
 
         const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+        const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
         const int indRank   = indices->rankOf();
         const int updRank   = updates->rankOf();
@@ -53,6 +54,11 @@ namespace ops {
         std::move(std::begin(outShape) + indices->sizeAt(-1), std::end(outShape), std::back_inserter(expectedUpdShape));
         REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ND OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
 
+        if(checkIndices) {
+            const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output);
+            REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_ND OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+        }
+
         // initial zeroing of output
         *output = 0;
 
@@ -73,7 +79,7 @@ namespace ops {
         DECLARE_SHAPE_FN(scatter_nd) {
 
             auto shape = INPUT_VARIABLE(2);
-            auto updShapeInfo = inputShape->at(1);            
+            auto updShapeInfo = inputShape->at(1);
 
             Nd4jLong *outShapeInfo;
             ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(shape->lengthOf()), Nd4jLong);
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_add.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_add.cpp
index 5679b22ec..43c2c66ed 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_add.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_add.cpp
@@ -34,25 +34,31 @@ OP_IMPL(scatter_nd_add, 3, 1, true) {
 
     auto output = OUTPUT_VARIABLE(0);
 
-    bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+    const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+    const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
     const int inRank  = input->rankOf();
     const int indRank = indices->rankOf();
     const int updRank = updates->rankOf();
 
     const Nd4jLong indLastDim = indices->sizeAt(-1);
-    
+
     REQUIRE_TRUE(indLastDim <= inRank, 0, "SCATTER_ND_ADD OP: the last dimension of indices array must be <= input_array_rank, but got %i instead !", indLastDim);
     REQUIRE_TRUE(updRank == (indRank - 1 + inRank - indLastDim), 0, "SCATTER_ND_ADD OP: the equality updates_rank = (indices_rank - 1 + input_rank - last_indices_dimension) must be true for input arrays, but got instead: updates_rank = %i, indices_rank = %i, last_indices_dimension = %i !", updRank, indRank, indLastDim);
 
     std::vector<Nd4jLong> inShape  = input->getShapeAsVector();
     std::vector<Nd4jLong> updShape = updates->getShapeAsVector();
-    std::vector<Nd4jLong> indShape = indices->getShapeAsVector();    
-    std::vector<Nd4jLong> expectedUpdShape(std::begin(indShape), std::end(indShape) - 1);     
+    std::vector<Nd4jLong> indShape = indices->getShapeAsVector();
+    std::vector<Nd4jLong> expectedUpdShape(std::begin(indShape), std::end(indShape) - 1);
     if(inRank > indLastDim)
-        std::move(std::begin(inShape) + indLastDim, std::end(inShape), std::back_inserter(expectedUpdShape));        
+        std::move(std::begin(inShape) + indLastDim, std::end(inShape), std::back_inserter(expectedUpdShape));
     REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ND_ADD OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
 
+    if(checkIndices) {
+        const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output);
+        REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_ND_ADD OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+    }
+
     if (!block.isInplace())
         output->assign(input);
 
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_sub.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_sub.cpp
index 7b0d0e1ad..eb4768f86 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_sub.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_sub.cpp
@@ -34,25 +34,31 @@ OP_IMPL(scatter_nd_sub, 3, 1, true) {
 
     auto output = OUTPUT_VARIABLE(0);
 
-    bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+    const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+    const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
     const int inRank  = input->rankOf();
     const int indRank = indices->rankOf();
     const int updRank = updates->rankOf();
 
     const Nd4jLong indLastDim = indices->sizeAt(-1);
-    
+
     REQUIRE_TRUE(indLastDim <= inRank, 0, "SCATTER_ND_SUB OP: the last dimension of indices array must be <= input_array_rank, but got %i instead !", indLastDim);
     REQUIRE_TRUE(updRank == (indRank - 1 + inRank - indLastDim), 0, "SCATTER_ND_SUB OP: the equality updates_rank = (indices_rank - 1 + input_rank - last_indices_dimension) must be true for input arrays, but got instead: updates_rank = %i, indices_rank = %i, last_indices_dimension = %i !", updRank, indRank, indLastDim);
 
     std::vector<Nd4jLong> inShape  = input->getShapeAsVector();
     std::vector<Nd4jLong> updShape = updates->getShapeAsVector();
-    std::vector<Nd4jLong> indShape = indices->getShapeAsVector();    
-    std::vector<Nd4jLong> expectedUpdShape(std::begin(indShape), std::end(indShape) - 1);     
+    std::vector<Nd4jLong> indShape = indices->getShapeAsVector();
+    std::vector<Nd4jLong> expectedUpdShape(std::begin(indShape), std::end(indShape) - 1);
     if(inRank > indLastDim)
-        std::move(std::begin(inShape) + indLastDim, std::end(inShape), std::back_inserter(expectedUpdShape));        
+        std::move(std::begin(inShape) + indLastDim, std::end(inShape), std::back_inserter(expectedUpdShape));
     REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ND_SUB OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
 
+    if(checkIndices) {
+        const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output);
+        REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_ND_SUB OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+    }
+
     if (!block.isInplace())
         output->assign(input);
 
@@ -62,7 +68,7 @@ OP_IMPL(scatter_nd_sub, 3, 1, true) {
 }
 
     DECLARE_TYPES(scatter_nd_sub) {
-        getOpDescriptor()                
+        getOpDescriptor()
             ->setAllowedInputTypes(0, {ALL_INTS, ALL_FLOATS})
             ->setAllowedInputTypes(1, {ALL_INTS})
             ->setAllowedInputTypes(2, {ALL_INTS, ALL_FLOATS})
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_update.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_update.cpp
index e973ff160..e6bfb8703 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_update.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_nd_update.cpp
@@ -34,25 +34,31 @@ OP_IMPL(scatter_nd_update, 3, 1, true) {
 
     auto output = OUTPUT_VARIABLE(0);
 
-    bool lock = block.getBArguments()->empty() ? true : B_ARG(0);
+    const bool lock = block.getBArguments()->empty() ? true : B_ARG(0);
+    const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
     const int inRank  = input->rankOf();
     const int indRank = indices->rankOf();
     const int updRank = updates->rankOf();
 
     const Nd4jLong indLastDim = indices->sizeAt(-1);
-    
+
     REQUIRE_TRUE(indLastDim <= inRank, 0, "SCATTER_ND_UPDATE OP: the last dimension of indices array must be <= input_array_rank, but got %i instead !", indLastDim);
     REQUIRE_TRUE(updRank == (indRank - 1 + inRank - indLastDim), 0, "SCATTER_ND_UPDATE OP: the equality updates_rank = (indices_rank - 1 + input_rank - last_indices_dimension) must be true for input arrays, but got instead: updates_rank = %i, indices_rank = %i, last_indices_dimension = %i !", updRank, indRank, indLastDim);
 
     std::vector<Nd4jLong> inShape  = input->getShapeAsVector();
     std::vector<Nd4jLong> updShape = updates->getShapeAsVector();
-    std::vector<Nd4jLong> indShape = indices->getShapeAsVector();    
-    std::vector<Nd4jLong> expectedUpdShape(std::begin(indShape), std::end(indShape) - 1);     
+    std::vector<Nd4jLong> indShape = indices->getShapeAsVector();
+    std::vector<Nd4jLong> expectedUpdShape(std::begin(indShape), std::end(indShape) - 1);
     if(inRank > indLastDim)
         std::move(std::begin(inShape) + indLastDim, std::end(inShape), std::back_inserter(expectedUpdShape));
     REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ND_UPDATE OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
 
+    if(checkIndices) {
+        const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output);
+        REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_ND_UPDATE OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+    }
+
     if (!block.isInplace())
         output->assign(input);
 
@@ -62,7 +68,7 @@ OP_IMPL(scatter_nd_update, 3, 1, true) {
 }
 
     DECLARE_TYPES(scatter_nd_update) {
-        getOpDescriptor()                
+        getOpDescriptor()
             ->setAllowedInputTypes(0, {ALL_INTS, ALL_FLOATS})
             ->setAllowedInputTypes(1, {ALL_INTS})
             ->setAllowedInputTypes(2, {ALL_INTS, ALL_FLOATS})
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_sub.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_sub.cpp
index cf3745236..1971b346f 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_sub.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_sub.cpp
@@ -38,6 +38,7 @@ namespace nd4j {
                 output->assign(input);
 
             const bool lock = block.getBArguments()->empty() ? false : B_ARG(0);
+            const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
             const int inRank  = input->rankOf();
             const int indRank = indices->rankOf();
@@ -70,9 +71,16 @@ namespace nd4j {
                 REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_SUB OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
             }
 
-            if (!indices->isEmpty())
+            if (!indices->isEmpty()) {
+
+                if(checkIndices) {
+                    const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output, 0);
+                    REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_SUB OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+                }
+
                 // ScatterHelper<T>::template scatterApply<simdOps::Subtract<T>>(output, indices, updates);
                 helpers::scatter(block.launchContext(), pairwise::Subtract, *indices, *updates, *output, lock);
+            }
 
             return Status::OK();
         }
diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_upd.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_upd.cpp
index 55076e51e..081d1cd76 100644
--- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_upd.cpp
+++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_upd.cpp
@@ -37,6 +37,7 @@ namespace nd4j {
                 output->assign(input);
 
             const bool lock = block.getBArguments()->empty() ? true : B_ARG(0);
+            const bool checkIndices = block.getBArguments()->size() <= 1 ? false : B_ARG(1);
 
             const int inRank  = input->rankOf();
             const int indRank = indices->rankOf();
@@ -68,9 +69,16 @@ namespace nd4j {
                 REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_UPD OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str());
             }
 
-            if (!indices->isEmpty())
+            if (!indices->isEmpty()) {
+
+                if(checkIndices) {
+                    const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *output, 0);
+                    REQUIRE_TRUE(numOfBadIndx == 0, 0, "SCATTER_UPD OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+                }
+
                 // ScatterHelper<T>::template scatterApply<simdOps::Copy<T>>(output, indices, updates);
                 helpers::scatter(block.launchContext(), pairwise::CopyPws, *indices, *updates, *output, lock);
+            }
 
             return Status::OK();
         }
diff --git a/libnd4j/include/ops/declarable/generic/transforms/gather.cpp b/libnd4j/include/ops/declarable/generic/transforms/gather.cpp
index 2e38f9977..61ed3bc65 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/gather.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/gather.cpp
@@ -22,7 +22,8 @@
 #if NOT_EXCLUDED(OP_gather)
 
 #include <ops/declarable/CustomOperations.h>
-#include<ops/declarable/helpers/gather.h>
+#include <ops/declarable/helpers/gather.h>
+#include <ops/declarable/helpers/scatter.h>
 
 
 namespace nd4j {
@@ -34,8 +35,10 @@ CUSTOM_OP_IMPL(gather, 1, 1, false, 0, -2) {
 
 	auto input   = INPUT_VARIABLE(0);
     auto indices = block.width() > 1 ? INPUT_VARIABLE(1) : nullptr;
-	auto output  = OUTPUT_VARIABLE(0);	
-	
+	auto output  = OUTPUT_VARIABLE(0);
+
+	const bool checkIndices = block.getBArguments()->empty() ? false : B_ARG(0);
+
 	//Edge case: empty indices -> empty output
 	if(indices != nullptr && indices->isEmpty()){
 		REQUIRE_TRUE(output->isEmpty(), 0, "Gather op: If indices are empty, output must also be empty");
@@ -47,7 +50,7 @@ CUSTOM_OP_IMPL(gather, 1, 1, false, 0, -2) {
     std::vector<int> intArgs;
     if (block.width() > 2) {
     	intArgs = INPUT_VARIABLE(2)->template asVectorT<int>();
-    } 
+    }
     else {
 		if (numOfIntArgs == 0)
 			intArgs.emplace_back(0);
@@ -64,14 +67,16 @@ CUSTOM_OP_IMPL(gather, 1, 1, false, 0, -2) {
     REQUIRE_TRUE(intArgs[0] < inputRank, 0, "GATHER op: input axis must be smaller than input array rank, but got %i and %i correspondingly!", intArgs[0], inputRank);
     REQUIRE_TRUE(indices != nullptr || numOfIntArgs > 1, 0, "GATHER op: indices should be provided either as additional input array or as IntArguments !");
 
-	if (indices != nullptr) {
-		for(int i = 0; i < indices->lengthOf(); ++i)            
-            REQUIRE_TRUE(indices->e<Nd4jLong>(i) < input->sizeAt(intArgs[0]), 0, "GATHER op: indices array contains wrong elements, each element must be smaller than corresponding dimension of input array !");
-	}
-	else {
-		for(int i = 1; i < numOfIntArgs; ++i)            
-            REQUIRE_TRUE(intArgs[i] < input->sizeAt(intArgs[0]), 0, "GATHER op: some of indexes is larger than corresponding shape of input array !");
-	}
+	if(checkIndices) {
+
+		NDArray* pIndices = indices;
+		if(indices == nullptr)
+            pIndices = new NDArray(input->ordering(), {static_cast<int>(intArgs.size()) - 1}, std::vector<double>(intArgs.begin() + 1, intArgs.end()), DataType::INT64, block.launchContext());
+        const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *pIndices, *input, intArgs[0]);
+        REQUIRE_TRUE(numOfBadIndx == 0, 0, "GATHER OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+        if(indices == nullptr)
+            delete pIndices;
+    }
 
 	helpers::gather(block.launchContext(), input, indices, output, intArgs);
 
@@ -87,7 +92,7 @@ DECLARE_TYPES(gather) {
 
 DECLARE_SHAPE_FN(gather) {
 
-	// check shape of paddings 
+	// check shape of paddings
 	auto inputShapeInfo  = inputShape->at(0);
 	Nd4jLong* outputShapeInfo = nullptr;
 
@@ -105,21 +110,21 @@ DECLARE_SHAPE_FN(gather) {
     REQUIRE_TRUE(axis < inputRank, 0, "GATHER op: input axis must be smaller than input array rank, but got %i and %i correspondingly!", axis, inputRank);
 
 	bool isEmpty = false;
-	
+
 	if (block.width() > 1) {
 		auto indicesShapeInfo = inputShape->at(1);
-    
+
     	int indicesRank = shape::rank(indicesShapeInfo);
-      
+
     	int outputRank = inputRank + indicesRank - 1;
-		
+
     	ALLOCATE(outputShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outputRank), Nd4jLong);
 
     	// fill output shapeInfo
     	outputShapeInfo[0] = outputRank;
-    	int shapeIdx = 1;    
-    	
-    	for(int i = 0; i < axis; ++i)    		
+    	int shapeIdx = 1;
+
+    	for(int i = 0; i < axis; ++i)
     		outputShapeInfo[shapeIdx++] = inputShapeInfo[i+1];
 
 		for(int i = 0; i < indicesRank; ++i)
@@ -127,7 +132,7 @@ DECLARE_SHAPE_FN(gather) {
 
     	for(int i = axis+1; i < inputRank; ++i)
     		outputShapeInfo[shapeIdx++] = inputShapeInfo[i+1];
-	} 
+	}
 	else if (block.numI() > 1) {
 
 		int indicesRank = block.numI() == 2 ? 0 : 1;
@@ -137,7 +142,7 @@ DECLARE_SHAPE_FN(gather) {
 
 		// building shape manually
 		outputShapeInfo[0] = outputRank;
-    	int shapeIdx = 1;    
+    	int shapeIdx = 1;
     	for(int i = 0; i < axis; ++i)
     		outputShapeInfo[shapeIdx++] = inputShapeInfo[i+1];
 
diff --git a/libnd4j/include/ops/declarable/generic/transforms/gatherNd.cpp b/libnd4j/include/ops/declarable/generic/transforms/gatherNd.cpp
index c64150c04..c889569e2 100644
--- a/libnd4j/include/ops/declarable/generic/transforms/gatherNd.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/gatherNd.cpp
@@ -23,6 +23,7 @@
 
 #include <ops/declarable/CustomOperations.h>
 #include<ops/declarable/helpers/transforms.h>
+#include <ops/declarable/helpers/scatter.h>
 
 namespace nd4j {
 namespace ops {
@@ -35,6 +36,8 @@ CUSTOM_OP_IMPL(gather_nd, 2, 1, false, 0, 0) {
     auto indices = INPUT_VARIABLE(1);
     auto output  = OUTPUT_VARIABLE(0);
 
+    const bool checkIndices = block.getBArguments()->empty() ? false : B_ARG(0);
+
     const int rankIn = input->rankOf();
     const int rankInd = indices->rankOf();
 
@@ -42,6 +45,11 @@ CUSTOM_OP_IMPL(gather_nd, 2, 1, false, 0, 0) {
     int lastIndDim = indices->sizeAt(-1);
     REQUIRE_TRUE(lastIndDim <= rankIn, 0, "GATHER_ND op: the last dimension of indices array must be <= rank of input array but got %i and %i correspondingly!", lastIndDim, rankIn);
 
+    if(checkIndices) {
+        const Nd4jLong numOfBadIndx = helpers::checkIndices(block.launchContext(), *indices, *input);
+        REQUIRE_TRUE(numOfBadIndx == 0, 0, "GATHER_ND OP: please check elements of indices-array, total number of wrong elements is %lld!", numOfBadIndx);
+    }
+
     helpers::gatherND(block.launchContext(), *input, *indices, *output);
 
     return Status::OK();
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
index 99605e7cc..9ae191c76 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
@@ -27,6 +27,49 @@ namespace nd4j    {
 namespace ops     {
 namespace helpers {
 
+///////////////////////////////////////////////////////////////////
+// x - indices, z - input/output
+template<typename T>
+Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int axis) {
+
+    std::atomic<int64_t> numOfBadIndx{0};
+
+    const auto x = indices.bufferAsT<T>();
+
+    const auto xShapeInfo = indices.getShapeInfo();
+    const auto zShapeInfo = output.getShapeInfo();
+
+    const auto xRank = indices.rankOf();
+
+    auto func = PRAGMA_THREADS_FOR {
+
+        Nd4jLong xCoords[MAX_RANK];
+
+        for (auto i = start; i < stop; i += increment) {
+
+            shape::index2coords(i, xShapeInfo, xCoords);
+
+            const Nd4jLong currentInd = x[shape::getOffset(xShapeInfo, xCoords)];
+
+            if(currentInd >= shape::sizeAt(zShapeInfo, axis == -1 ? xCoords[xRank-1] : axis)) {
+                printf("checkIndices: out of range element %lld at index %ld \n", currentInd,  i);
+                ++numOfBadIndx;
+            }
+        }
+    };
+
+    samediff::Threads::parallel_for(func, 0, indices.lengthOf());
+
+    return numOfBadIndx;
+}
+
+///////////////////////////////////////////////////////////////////
+Nd4jLong checkIndices(nd4j::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis) {
+
+    BUILD_SINGLE_SELECTOR(indices.dataType(), return checkIndices_, (indices, output, axis), INDEXING_TYPES);
+}
+
+///////////////////////////////////////////////////////////////////
 void scatter(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) {
 
     const int outRank = output.rankOf();
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
index 308e58814..f6d8acc77 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu
@@ -108,12 +108,12 @@ __host__ static void gatherCudaLauncher(const cudaStream_t *stream, const int nu
 void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray* indices, NDArray* output, const std::vector<int>& intArgs) {
 
     const int inputRank = input->rankOf();
-    int axis = intArgs.size() > 0 ? intArgs[0] : 0;
+    const int numOfIntArgs = intArgs.size();
+
+    int axis = numOfIntArgs > 0 ? intArgs[0] : 0;
     if(axis < 0)
         axis += inputRank;
 
-    const int numOfIntArgs = intArgs.size();
-
     if (indices == nullptr && numOfIntArgs == 2) { // scalar case
         output->assign((*input)(intArgs[1], {axis}));
     }
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
index 11ba6571b..6b3bf5135 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu
@@ -106,7 +106,7 @@ namespace nd4j {
                     const auto xOffset = shape::getOffset(xShapeInfo, xCoordStart);
 
                     z[zOffset] = x[xOffset];
-                    printf("z[%lld] = x[%lld] = %f\n", zOffset, xOffset, (float) z[zOffset]);
+                    // printf("z[%lld] = x[%lld] = %f\n", zOffset, xOffset, (float) z[zOffset]);
                 }
             }
 
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
index 501b9bca4..ec85efddf 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@@ -31,142 +31,688 @@ namespace nd4j    {
 namespace ops     {
 namespace helpers {
 
-            // template<typename T, bool locking>
-            // __global__ static void scatterCuda(const int opCode, const int numOfSubArrs,
-            //                                          void* vx, const Nd4jLong *xShapeInfo, const Nd4jLong *xOffsets,
-            //                                          void* vy, const Nd4jLong *yShapeInfo, const Nd4jLong *yOffsets,
-            //                                          const int* indexes, unsigned int arrLenX, unsigned int arrLenY) {
 
-            //     __shared__ T *x, *y;
+///////////////////////////////////////////////////////////////////
+// x - indices, y - contains number of bad indices, z - input/output
+template<typename X>
+__global__ static void checkIndicesCuda(const void *vx, const Nd4jLong *xShapeInfo, Nd4jLong* y, const Nd4jLong *zShapeInfo, const int axis) {
 
-            //     if (locking) {
+    const auto x = reinterpret_cast<const X*>(vx);
 
-            //         for (int e = 0; e < numOfSubArrs; e++) {
+    __shared__ int xRank, *coords, xLastDim;
+    __shared__ Nd4jLong xLen, numOfBadIndxPerBlock;
 
-            //             const auto xIndex = indexes[e];
-            //             const bool isOwner = xIndex < gridDim.x ? blockIdx.x == xIndex : blockIdx.x == xIndex % gridDim.x;
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        coords = reinterpret_cast<int*>(shmem);
 
-            //             if (!isOwner)
-            //                 continue;
+        xRank = shape::rank(xShapeInfo);
+        xLen  = shape::length(xShapeInfo);
 
-            //             if (threadIdx.x == 0) {
-            //                 x = reinterpret_cast<T *>(vx) + xOffsets[xIndex];
-            //                 y = reinterpret_cast<T *>(vy) + yOffsets[e];
-            //             }
-            //             __syncthreads();
+        numOfBadIndxPerBlock = 0;
+    }
+    __syncthreads();
 
-            //             for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) {
+    auto xCoords = coords + threadIdx.x * xRank;
 
-            //                 const auto xOffset = shape::getIndexOffset(i, xShapeInfo);
-            //                 const auto yOffset = shape::getIndexOffset(i, yShapeInfo);
+    for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < xLen; i += gridDim.x * blockDim.x) {
 
-            //                 switch (opCode) {
-            //                     case pairwise::Add:
-            //                         x[xOffset] += y[yOffset];
-            //                         break;
-            //                     case pairwise::Subtract:
-            //                         x[xOffset] -= y[yOffset];
-            //                         break;
-            //                     case pairwise::Multiply:
-            //                         x[xOffset] *= y[yOffset];
-            //                         break;
-            //                     case pairwise::Divide:
-            //                         x[xOffset] /= y[yOffset];
-            //                         break;
-            //                     case pairwise::ReverseSubtract:
-            //                         x[xOffset] = y[yOffset] - x[xOffset];
-            //                         break;
-            //                     case pairwise::ReverseDivide:
-            //                         x[xOffset] = y[yOffset] / x[xOffset];
-            //                         break;
-            //                     case pairwise::CopyPws:
-            //                         x[xOffset] = y[yOffset];
-            //                         break;
-            //                     default:
-            //                         continue;
-            //                 }
-            //             }
-            //             __syncthreads();
-            //         }
-            //     } else {
-            //         for (int e = blockIdx.x; e < numOfSubArrs; e+= gridDim.x) {
+        shape::index2coords(i, xShapeInfo, xCoords);
 
-            //             if (threadIdx.x == 0) {
-            //                 const auto xIndex = indexes[e];
-            //                 x = reinterpret_cast<T *>(vx) + xOffsets[xIndex];
-            //                 y = reinterpret_cast<T *>(vy) + yOffsets[e];
-            //             }
-            //             __syncthreads();
+        const Nd4jLong currentInd = x[shape::getOffset(xShapeInfo, xCoords)];
 
-            //             for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) {
-            //                 const auto xOffset = shape::getIndexOffset(i, xShapeInfo);
-            //                 const auto yOffset = shape::getIndexOffset(i, yShapeInfo);
+        if(currentInd >= shape::sizeAt(zShapeInfo, axis == -1 ? xCoords[xRank-1] : axis)) {
+            printf("checkIndices cuda: out of range element %lld at index %lld \n", currentInd,  i);
+            nd4j::math::atomics::nd4j_atomicAdd<Nd4jLong>(&numOfBadIndxPerBlock, 1);
+        }
+    }
+    __syncthreads();
 
-            //                 switch (opCode) {
-            //                     case pairwise::Add:
-            //                         x[xOffset] += y[yOffset];
-            //                         break;
-            //                     case pairwise::Subtract:
-            //                         x[xOffset] -= y[yOffset];
-            //                         break;
-            //                     case pairwise::Multiply:
-            //                         x[xOffset] *= y[yOffset];
-            //                         break;
-            //                     case pairwise::Divide:
-            //                         x[xOffset] /= y[yOffset];
-            //                         break;
-            //                     case pairwise::ReverseSubtract:
-            //                         x[xOffset] = y[yOffset] - x[xOffset];
-            //                         break;
-            //                     case pairwise::ReverseDivide:
-            //                         x[xOffset] = y[yOffset] / x[xOffset];
-            //                         break;
-            //                     case pairwise::CopyPws:
-            //                         x[xOffset] = y[yOffset];
-            //                         break;
-            //                     default:
-            //                         continue;
-            //                 }
-            //             }
-            //             __syncthreads();
-            //         }
-            //     }
-            // }
+    if (threadIdx.x == 0 && numOfBadIndxPerBlock != 0)
+        nd4j::math::atomics::nd4j_atomicAdd<Nd4jLong>(y, numOfBadIndxPerBlock);
+}
+
+///////////////////////////////////////////////////////////////////
+template<typename X>
+static void checkIndicesCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                              const void *vx, const Nd4jLong *xShapeInfo, Nd4jLong* y, const Nd4jLong *zShapeInfo, const int axis) {
+
+    checkIndicesCuda<X><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, y, zShapeInfo, axis);
+}
 
 
-            // template <typename T>
-            // void scatter_(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) {
-            //     std::vector<int> dims = {0};
-            //     auto inverted = ShapeUtils::evalDimsToExclude(output.rankOf(), dims);
+///////////////////////////////////////////////////////////////////
+Nd4jLong checkIndices(nd4j::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis) {
 
-            //     auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), inverted);
-            //     auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), inverted);
+    const int threadsPerBlock = MAX_NUM_THREADS / 2;
+    const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+    const int sharedMem = threadsPerBlock * sizeof(int) * indices.rankOf() + 256;
 
-            //     auto psX = packX.specialShapeInfo();
-            //     auto psY = packY.specialShapeInfo();
+    const auto xType = indices.dataType();
 
-            //     PointersManager manager(context, "scatter");
+    PointersManager manager(context, "scatterNDcheckIndices");
 
-            //     auto poX = packX.specialOffsets();
-            //     auto poY = packY.specialOffsets();
+    // scalar, initial value = 0
+    NDArray numOfBadIndx(nd4j::DataType::INT64, context, true);
 
-            //     NDArray::prepareSpecialUse({&output}, {&updates, &indices});
+    NDArray::prepareSpecialUse({&numOfBadIndx}, {&indices});
+    BUILD_SINGLE_SELECTOR(xType, checkIndicesCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), reinterpret_cast<Nd4jLong*>(numOfBadIndx.getSpecialBuffer()), output.getSpecialShapeInfo(), axis), INDEXING_TYPES);
+    NDArray::registerSpecialUse({&numOfBadIndx}, {&indices});
 
-            //     unsigned int tadLengthX = shape::length(packX.primaryShapeInfo());
-            //     unsigned int tadLengthY = shape::length(packY.primaryShapeInfo());
-            //     if (tadLengthX != tadLengthY)
-            //         throw std::runtime_error("scatter: Lengths of TADs must be equal");
+    manager.synchronize();
 
-            //     auto blockSize = nd4j::math::nd4j_max<int>(32, nd4j::math::nd4j_min<int>(tadLengthX, 1024));
+    return numOfBadIndx.t<Nd4jLong>(0);
+}
 
-            //     if (lock)
-            //         scatterCuda<T, true><<<512, blockSize, 1024, *context->getCudaStream()>>>(op, indices.lengthOf(), output.getSpecialBuffer(), psX, poX, updates.getSpecialBuffer(), psY, poY, reinterpret_cast<int *>(indices.getSpecialBuffer()), tadLengthX, tadLengthY);
-            //     else
-            //         scatterCuda<T, false><<<512, blockSize, 1024, *context->getCudaStream()>>>(op, indices.lengthOf(), output.getSpecialBuffer(), psX, poX, updates.getSpecialBuffer(), psY, poY, reinterpret_cast<int *>(indices.getSpecialBuffer()), tadLengthX, tadLengthY);
+///////////////////////////////////////////////////////////////////
+// x - indices, y - updates, z - input/output
+template<typename X, typename Y>
+__global__ static void scatterLockCuda(const int opCode,
+                                        const void *vx, const Nd4jLong *xShapeInfo,
+                                        const void *vy, const Nd4jLong *yShapeInfo,
+                                              void *vz, const Nd4jLong *zShapeInfo) {
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const Y*>(vy);
+          auto z = reinterpret_cast<Y*>(vz);
+
+    __shared__ int xRank, yRank, zRank, xNonUnitDim, yNonUnitDim, zNonUnitDim, *coords;
+    __shared__ Nd4jLong xLen, zLen;
+    __shared__ bool is1Dcase, xySameStride;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        coords = reinterpret_cast<int*>(shmem);
+
+        xLen = shape::length(xShapeInfo);
+        zLen = shape::length(zShapeInfo);
+
+        xRank = shape::rank(xShapeInfo);
+        yRank = shape::rank(yShapeInfo);
+        zRank = shape::rank(zShapeInfo);
+
+        xNonUnitDim = yNonUnitDim = zNonUnitDim = 0;
+
+        is1Dcase = (shape::isCommonVector(zShapeInfo, zNonUnitDim) || shape::isScalar(zShapeInfo)) && (shape::isCommonVector(yShapeInfo, yNonUnitDim) || shape::isScalar(yShapeInfo)) && (shape::isCommonVector(xShapeInfo, xNonUnitDim) || shape::isScalar(xShapeInfo));
+
+        if(is1Dcase)
+            xySameStride = shape::stride(xShapeInfo)[xNonUnitDim] = shape::stride(yShapeInfo)[yNonUnitDim];
+    }
+    __syncthreads();
+
+
+    Nd4jLong yOffset, zOffset;
+    int zFirstCoord, *yCoords, *zCoords;
+
+    for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) {
+
+        if(!is1Dcase) {
+
+            yCoords = coords + threadIdx.x * (yRank + zRank);
+            zCoords = yCoords + yRank;
+            shape::index2coords(i, zShapeInfo, zCoords);
+        }
+
+        for (Nd4jLong j = 0; j < xLen; ++j) {
+
+            if(is1Dcase) {
+
+                yOffset = j * shape::stride(yShapeInfo)[yNonUnitDim];
+                zFirstCoord = x[xySameStride ? yOffset : j * shape::stride(xShapeInfo)[xNonUnitDim]];
+
+                if(i != zFirstCoord)
+                    continue;
+
+                zOffset = i * shape::stride(zShapeInfo)[zNonUnitDim];
+            }
+
+            else {
+
+                shape::index2coords(j, xShapeInfo, yCoords);                 // first xRank coordinates in yCoords are the same for y and x
+
+                zFirstCoord = x[shape::getOffset(xShapeInfo, yCoords)];
+
+                if(zCoords[0] != zFirstCoord)
+                    continue;
+
+                for (uint k = 0; k < yRank - xRank; ++k)
+                    yCoords[xRank + k] = zCoords[k + 1];
+
+                yOffset = shape::getOffset(yShapeInfo, yCoords);
+                zOffset = shape::getOffset(zShapeInfo, zCoords);
+            }
+
+            switch (opCode) {
+                case pairwise::Add:
+                    z[zOffset] += y[yOffset];
+                    break;
+                case pairwise::Subtract:
+                    z[zOffset] -= y[yOffset];
+                    break;
+                case pairwise::Multiply:
+                    z[zOffset] *= y[yOffset];
+                    break;
+                case pairwise::Divide:
+                    z[zOffset] /= y[yOffset];
+                    break;
+                case pairwise::ReverseSubtract:
+                    z[zOffset] = y[yOffset] - z[zOffset];
+                    break;
+                case pairwise::ReverseDivide:
+                    z[zOffset] = y[yOffset] / z[zOffset];
+                    break;
+                case pairwise::CopyPws:
+                    z[zOffset] = y[yOffset];
+                    break;
+                case pairwise::MaxPairwise:
+                    if(z[zOffset] < y[yOffset]) z[zOffset] = y[yOffset];
+                    break;
+                case pairwise::MinPairwise:
+                    if(z[zOffset] > y[yOffset]) z[zOffset] = y[yOffset];
+                    break;
+                default:
+                    continue;
+            }
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////
+// x - indices, y - updates, z - input/output
+template<typename X, typename Y>
+__global__ static void scatterCuda(const int opCode,
+                                   const void *vx, const Nd4jLong *xShapeInfo,
+                                   const void *vy, const Nd4jLong *yShapeInfo,
+                                         void *vz, const Nd4jLong *zShapeInfo) {
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const Y*>(vy);
+          auto z = reinterpret_cast<Y*>(vz);
+
+    __shared__ int xRank, yRank, zRank, xNonUnitDim, yNonUnitDim, zNonUnitDim, *coords;
+    __shared__ Nd4jLong yLen;
+    __shared__ bool is1Dcase, xySameStride;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        coords = reinterpret_cast<int*>(shmem);
+
+        yLen = shape::length(yShapeInfo);
+
+        xRank = shape::rank(xShapeInfo);
+        yRank = shape::rank(yShapeInfo);
+        zRank = shape::rank(zShapeInfo);
+
+        xNonUnitDim = yNonUnitDim = zNonUnitDim = 0;
+
+        is1Dcase = (shape::isCommonVector(zShapeInfo, zNonUnitDim) || shape::isScalar(zShapeInfo)) && (shape::isCommonVector(yShapeInfo, yNonUnitDim) || shape::isScalar(yShapeInfo)) && (shape::isCommonVector(xShapeInfo, xNonUnitDim) || shape::isScalar(xShapeInfo));
+
+        if(is1Dcase)
+            xySameStride = shape::stride(xShapeInfo)[xNonUnitDim] = shape::stride(yShapeInfo)[yNonUnitDim];
+    }
+    __syncthreads();
+
+
+    Nd4jLong xOffset, yOffset, zOffset;
+    int *yCoords, *zCoords;
+
+    if(!is1Dcase) {
+        yCoords = coords + threadIdx.x * (yRank + zRank);
+        zCoords = yCoords + yRank;
+    }
+
+    for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < yLen; i += gridDim.x * blockDim.x) {
+
+        if(is1Dcase) {
+
+            yOffset = i * shape::stride(yShapeInfo)[yNonUnitDim];
+            zOffset = x[xySameStride ? yOffset : i * shape::stride(xShapeInfo)[xNonUnitDim]] * shape::stride(zShapeInfo)[zNonUnitDim];
+        }
+        else {
+            shape::index2coords(i, yShapeInfo, yCoords);
+
+            yOffset = shape::getOffset(yShapeInfo, yCoords);
+            xOffset = shape::getOffset(xShapeInfo, yCoords);                // first xRank coordinates in yCoords are the same for y and x -> for (uint j = 0; j < xRank; ++j) xCoords[j] = yCoords[j];
+
+            zCoords[0] = x[xOffset];
+
+            for (uint j = 0; j < yRank - xRank; ++j)
+                zCoords[j + 1] = yCoords[xRank + j];
+
+            zOffset = shape::getOffset(zShapeInfo, zCoords);
+        }
+
+        switch (opCode) {
+            case pairwise::Add:
+                z[zOffset] += y[yOffset];
+                break;
+            case pairwise::Subtract:
+                z[zOffset] -= y[yOffset];
+                break;
+            case pairwise::Multiply:
+                z[zOffset] *= y[yOffset];
+                break;
+            case pairwise::Divide:
+                z[zOffset] /= y[yOffset];
+                break;
+            case pairwise::ReverseSubtract:
+                z[zOffset] = y[yOffset] - z[zOffset];
+                break;
+            case pairwise::ReverseDivide:
+                z[zOffset] = y[yOffset] / z[zOffset];
+                break;
+            case pairwise::CopyPws:
+                z[zOffset] = y[yOffset];
+                break;
+            case pairwise::MaxPairwise:
+                if(z[zOffset] < y[yOffset]) z[zOffset] = y[yOffset];
+                break;
+            case pairwise::MinPairwise:
+                if(z[zOffset] > y[yOffset]) z[zOffset] = y[yOffset];
+                break;
+            default:
+                continue;
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+template<typename X, typename Y>
+static void scatterCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                const int opCode,
+                                const void *vx, const Nd4jLong *xShapeInfo,
+                                const void *vy, const Nd4jLong *yShapeInfo,
+                                      void *vz, const Nd4jLong *zShapeInfo,
+                                const bool lock) {
+
+    if(lock)
+        scatterLockCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
+    else
+        scatterCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
+}
+
+
+///////////////////////////////////////////////////////////////////
+void scatter(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) {
+
+    const auto xType = indices.dataType();
+    const auto yType = updates.dataType();
+
+    const int threadsPerBlock = MAX_NUM_THREADS / 4;
+    const int blocksPerGrid = ((lock ? output.lengthOf() : updates.lengthOf()) + threadsPerBlock - 1) / threadsPerBlock;
+    const int sharedMem = sizeof(int) * threadsPerBlock * (updates.rankOf() + output.rankOf()) + 256;
+
+    PointersManager manager(context, "scatter");
+
+    NDArray::prepareSpecialUse({&output}, {&updates, &indices});
+    BUILD_DOUBLE_SELECTOR(xType, yType, scatterCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), op, indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.getSpecialBuffer(), updates.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), lock), INDEXING_TYPES, GENERIC_NUMERIC_TYPES);
+    NDArray::registerSpecialUse({&output}, {&updates, &indices});
+
+    manager.synchronize();
+}
+
+///////////////////////////////////////////////////////////////////
+// x - indices, y - updates, z - output
+template<typename X, typename Y>
+__global__ static void scatterNDLockCuda(const int opCode,
+                                        const void *vx, const Nd4jLong *xShapeInfo,
+                                        const void *vy, const Nd4jLong *yShapeInfo,
+                                              void *vz, const Nd4jLong *zShapeInfo) {
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const Y*>(vy);
+          auto z = reinterpret_cast<Y*>(vz);
+
+    __shared__ int xRank, yRank, zRank, biggerXYRank, xLastDim, *coords, xNonUnitDim, yNonUnitDim, zNonUnitDim;
+    __shared__ Nd4jLong zLen, len;
+    __shared__ bool is1Dcase;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        coords = reinterpret_cast<int*>(shmem);
+
+        xRank = shape::rank(xShapeInfo);
+        yRank = shape::rank(yShapeInfo);
+        zRank = shape::rank(zShapeInfo);
+        xLastDim = shape::sizeAt(xShapeInfo, -1);
+
+        biggerXYRank = xRank > yRank ? xRank : yRank;
+
+        xNonUnitDim = yNonUnitDim = zNonUnitDim = 0;
+
+        is1Dcase = (shape::isCommonVector(zShapeInfo, zNonUnitDim) || shape::isScalar(zShapeInfo)) && (shape::isCommonVector(yShapeInfo, yNonUnitDim) || shape::isScalar(yShapeInfo)) && (shape::isCommonVector(xShapeInfo, xNonUnitDim) || shape::isScalar(xShapeInfo));
+
+        len  = is1Dcase ?  shape::length(xShapeInfo) : shape::length(xShapeInfo) / xLastDim;
+        zLen = shape::length(zShapeInfo);
+    }
+    __syncthreads();
+
+    Nd4jLong yOffset, zOffset, xOffset;
+    int *yCoords, *zCoords;
+
+    if(!is1Dcase) {
+        yCoords = coords + threadIdx.x * (biggerXYRank + zRank);
+        zCoords = yCoords + biggerXYRank;
+    }
+
+    for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) {
+
+        if(!is1Dcase)
+            shape::index2coords(i, zShapeInfo, zCoords);
+
+        for (Nd4jLong j = 0; j < len; ++j) {        // if !is1Dcase then we loop through first xRank-1 dimensions of x, that is we exclude last x dimension
+
+            if(is1Dcase) {
+
+                if(x[j * shape::stride(xShapeInfo)[xNonUnitDim]] != i)
+                    continue;
+
+                yOffset = j * shape::stride(yShapeInfo)[yNonUnitDim];
+                zOffset = i * shape::stride(zShapeInfo)[zNonUnitDim];
+            }
+            else {
+
+                shape::index2coords(j, xRank-1, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), yCoords);        // first xRank-1 coordinates in yCoords are the same for y and x
+
+                // first iteration
+                yCoords[xRank - 1] = 0;
+                xOffset = shape::getOffset(xShapeInfo, yCoords);
+                if(zCoords[0] != x[xOffset])
+                    continue;
+
+                // rest iterations
+                bool matched = true;
+                for (uint k = 1; k < xLastDim; ++k) {
+                    yCoords[xRank - 1] = k;
+                    xOffset += shape::stride(xShapeInfo)[xRank-1];
+                    if(zCoords[k] != x[xOffset]) {
+                        matched = false;
+                        break;
+                    }
+                }
+
+                if(!matched)
+                    continue;
+
+                for (uint k = xLastDim; k < zRank; ++k)
+                    yCoords[yRank - zRank + k] = zCoords[k];
+
+                yOffset = shape::getOffset(yShapeInfo, yCoords);
+                zOffset = shape::getOffset(zShapeInfo, zCoords);
+            }
+
+            switch (opCode) {
+                case pairwise::Add:
+                    z[zOffset] += y[yOffset];
+                    break;
+                case pairwise::Subtract:
+                    z[zOffset] -= y[yOffset];
+                    break;
+                case pairwise::Multiply:
+                    z[zOffset] *= y[yOffset];
+                    break;
+                case pairwise::Divide:
+                    z[zOffset] /= y[yOffset];
+                    break;
+                case pairwise::ReverseSubtract:
+                    z[zOffset] = y[yOffset] - z[zOffset];
+                    break;
+                case pairwise::ReverseDivide:
+                    z[zOffset] = y[yOffset] / z[zOffset];
+                    break;
+                case pairwise::CopyPws:
+                    z[zOffset] = y[yOffset];
+                    break;
+                case pairwise::MaxPairwise:
+                    if(z[zOffset] < y[yOffset]) z[zOffset] = y[yOffset];
+                    break;
+                case pairwise::MinPairwise:
+                    if(z[zOffset] > y[yOffset]) z[zOffset] = y[yOffset];
+                    break;
+                default:
+                    continue;
+            }
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+// x - indices, y - updates, z - output
+template<typename X, typename Y>
+__global__ static void scatterNDCuda(const int opCode,
+                                     const void *vx, const Nd4jLong *xShapeInfo,
+                                     const void *vy, const Nd4jLong *yShapeInfo,
+                                           void *vz, const Nd4jLong *zShapeInfo) {
+
+    const auto x = reinterpret_cast<const X*>(vx);
+    const auto y = reinterpret_cast<const Y*>(vy);
+          auto z = reinterpret_cast<Y*>(vz);
+
+    __shared__ int xRank, yRank, zRank, biggerXYRank, xLastDim, *coords, xNonUnitDim, yNonUnitDim, zNonUnitDim;
+    __shared__ Nd4jLong yLen;
+    __shared__ bool is1Dcase;
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        coords = reinterpret_cast<int*>(shmem);
+
+        yLen  = shape::length(yShapeInfo);
+        xRank = shape::rank(xShapeInfo);
+        yRank = shape::rank(yShapeInfo);
+        zRank = shape::rank(zShapeInfo);
+        xLastDim = shape::sizeAt(xShapeInfo, -1);
+
+        biggerXYRank = xRank > yRank ? xRank : yRank;
+
+        xNonUnitDim = yNonUnitDim = zNonUnitDim = 0;
+
+        is1Dcase = (shape::isCommonVector(zShapeInfo, zNonUnitDim) || shape::isScalar(zShapeInfo)) && (shape::isCommonVector(yShapeInfo, yNonUnitDim) || shape::isScalar(yShapeInfo)) && (shape::isCommonVector(xShapeInfo, xNonUnitDim) || shape::isScalar(xShapeInfo));
+    }
+    __syncthreads();
+
+    Nd4jLong yOffset, zOffset;
+    int *yCoords, *zCoords;
+
+    if(!is1Dcase) {
+        yCoords = coords + threadIdx.x * (biggerXYRank + zRank);
+        zCoords = yCoords + biggerXYRank;
+    }
+
+    for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < yLen; i += gridDim.x * blockDim.x) {
+
+        if(is1Dcase) {
+
+            yOffset = i * shape::stride(yShapeInfo)[zNonUnitDim];
+            zOffset = x[i * shape::stride(xShapeInfo)[xNonUnitDim]] * shape::stride(zShapeInfo)[zNonUnitDim];
+        }
+        else {
+
+            shape::index2coords(i, yShapeInfo, yCoords);
+
+            yOffset = shape::getOffset(yShapeInfo, yCoords);
+
+            if(yRank >= xRank)
+                zCoords[xLastDim] = yCoords[xRank - 1];                // saving y coordinate, since it might be changed in next instructions
+
+            for (uint j = 0; j < xLastDim; ++j) {                      // first xRank-1 coordinates in yCoords are the same for y and x
+                yCoords[xRank - 1] = j;
+                zCoords[j] = x[shape::getOffset(xShapeInfo, yCoords)];
+            }
+
+            for (uint j = xLastDim + 1; j < zRank; ++j)
+                zCoords[j] = yCoords[yRank - zRank + j];
+
+            zOffset = shape::getOffset(zShapeInfo, zCoords);
+        }
+
+        switch (opCode) {
+            case pairwise::Add:
+                z[zOffset] += y[yOffset];
+                break;
+            case pairwise::Subtract:
+                z[zOffset] -= y[yOffset];
+                break;
+            case pairwise::Multiply:
+                z[zOffset] *= y[yOffset];
+                break;
+            case pairwise::Divide:
+                z[zOffset] /= y[yOffset];
+                break;
+            case pairwise::ReverseSubtract:
+                z[zOffset] = y[yOffset] - z[zOffset];
+                break;
+            case pairwise::ReverseDivide:
+                z[zOffset] = y[yOffset] / z[zOffset];
+                break;
+            case pairwise::CopyPws:
+                z[zOffset] = y[yOffset];
+                break;
+            case pairwise::MaxPairwise:
+                if(z[zOffset] < y[yOffset]) z[zOffset] = y[yOffset];
+                break;
+            case pairwise::MinPairwise:
+                if(z[zOffset] > y[yOffset]) z[zOffset] = y[yOffset];
+                break;
+            default:
+                continue;
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+template<typename X, typename Y>
+static void scatterNDCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                  const int opCode,
+                                  const void *vx, const Nd4jLong *xShapeInfo,
+                                  const void *vy, const Nd4jLong *yShapeInfo,
+                                        void *vz, const Nd4jLong *zShapeInfo,
+                                  const bool lock) {
+
+    if(lock)
+        scatterNDLockCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
+    else
+        scatterNDCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
+}
+
+///////////////////////////////////////////////////////////////////
+void scatterND(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) {
+
+    const int xRank = indices.rankOf();
+    const int yRank = updates.rankOf();
+    const int zRank = output.rankOf();
+
+    const int threadsPerBlock = MAX_NUM_THREADS / 4;
+    const int blocksPerGrid = ((lock ? output.lengthOf() : updates.lengthOf()) + threadsPerBlock - 1) / threadsPerBlock;
+    const int sharedMem = threadsPerBlock * sizeof(int) * ((yRank > xRank ? yRank : xRank) + zRank) + 256;
+
+    const auto xType = indices.dataType();
+    const auto yType = updates.dataType();
+
+    PointersManager manager(context, "scatterND");
+
+    NDArray::prepareSpecialUse({&output}, {&updates, &indices});
+    BUILD_DOUBLE_SELECTOR(xType, yType, scatterNDCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), op, indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.getSpecialBuffer(), updates.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), lock), INDEXING_TYPES, GENERIC_NUMERIC_TYPES);
+    NDArray::registerSpecialUse({&output}, {&updates, &indices});
+
+    manager.synchronize();
+}
+
+///////////////////////////////////////////////////////////////////
+template<typename X, typename Z>
+__global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
+                                         void *vy, const Nd4jLong *yShapeInfo,
+                                         void *vz, const Nd4jLong *zShapeInfo) {
+
+    const auto x = reinterpret_cast<const X*>(vx);
+          auto y = reinterpret_cast<Z*>(vy);
+          auto z = reinterpret_cast<Z*>(vz);
+
+    __shared__ Nd4jLong xLen, *sharedMem;
+    __shared__ int xRank;   // xRank = zRank, yRank = xRank + 1
+
+    if (threadIdx.x == 0) {
+        extern __shared__ unsigned char shmem[];
+        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+        xLen  = shape::length(xShapeInfo);
+        xRank = shape::rank(xShapeInfo);
+    }
+    __syncthreads();
+
+    const auto xInd = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if(xInd >= xLen)
+        return;
+
+    auto coords = sharedMem + threadIdx.x * (xRank + 1);
+
+    shape::index2coords(xInd, xShapeInfo, coords);
+
+    // y last coordinate
+    coords[xRank] = x[shape::getOffset(xShapeInfo, coords)];
+
+    const auto yOffset = shape::getOffset(yShapeInfo, coords);
+
+    if(z == nullptr) { // gradient calculation
+        y[yOffset] -= 1.f;
+    }
+    else {
+        z[shape::getOffset(zShapeInfo, coords)] = y[yOffset];
+    }
+}
+
+///////////////////////////////////////////////////////////////////
+template<typename X, typename Z>
+static void scatterForLossCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void *vx, const Nd4jLong* xShapeInfo, void *vy, const Nd4jLong* yShapeInfo, void *vz, const Nd4jLong* zShapeInfo) {
+
+    scatterForLossCuda<X, Z><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
+}
+
+///////////////////////////////////////////////////////////////////
+void scatterForLoss(nd4j::LaunchContext* context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad) {
+    // shapes of indices and output must be the same
+    // shape of indices should be the same as updates shape with last dimension excluded, for example if updates is {a,b,c} then indices should be {a,b}
+
+    PointersManager manager(context, "scatterForLoss");
+
+    const int threadsPerBlock = MAX_NUM_THREADS / 2;
+    const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+    const int sharedMem = updates.rankOf() * sizeof(Nd4jLong) * threadsPerBlock  + 128;
+
+    if(calcGrad) {
+        NDArray::prepareSpecialUse({&updates}, {&indices});
+        BUILD_DOUBLE_SELECTOR(indices.dataType(), updates.dataType(), scatterForLossCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.specialBuffer(), updates.specialShapeInfo(), nullptr, nullptr), INDEXING_TYPES, FLOAT_TYPES);
+        NDArray::registerSpecialUse({&updates}, {&indices});
+    }
+    else {
+        NDArray::prepareSpecialUse({&output}, {&indices, &updates});
+        BUILD_DOUBLE_SELECTOR(indices.dataType(), updates.dataType(), scatterForLossCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.getSpecialBuffer(), updates.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo()), INDEXING_TYPES, FLOAT_TYPES);
+        NDArray::registerSpecialUse({&output}, {&indices, &updates});
+    }
+
+    manager.synchronize();
+}
+
+}
+}
+}
+
+
+/*
+
+///////////////////////////////////////////////////////////////////
+template<typename X, typename Y>
+static void scatterLockCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+                                    const int opCode,
+                                    const void* vx, const Nd4jLong *xShapeInfo,
+                                    const void* vy, const Nd4jLong *yTadShapeInfo, const Nd4jLong *yOffsets,
+                                          void* vz, const Nd4jLong *zTadShapeInfo, const Nd4jLong *zOffsets,
+                                    const Nd4jLong xLen, const Nd4jLong yTadLen, const Nd4jLong zTadLen) {
+
+    scatterLockCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode, vx, xShapeInfo, vy, yTadShapeInfo, yOffsets, vz, zTadShapeInfo, zOffsets, xLen, yTadLen, zTadLen);
+}
 
-            //      NDArray::registerSpecialUse({&output}, {&updates, &indices});
-            //     manager.synchronize();
-            // }
 
 ///////////////////////////////////////////////////////////////////
 // x - indices, y - updates, z - input/output
@@ -177,6 +723,35 @@ __global__ static void scatterLockCuda(const int opCode,
                                              void* vz, const Nd4jLong *zTadShapeInfo, const Nd4jLong *zOffsets,
                                        const Nd4jLong xLen, const Nd4jLong yTadLen, const Nd4jLong zTadLen) {
 
+
+
+ const int xRank = indices.rankOf();
+
+        std::vector<int> zTadDims = ShapeUtils::evalDimsToExclude(output.rankOf(), {0});
+
+        int sizeOfUpdDims = xRank;
+        if(output.rankOf() == updates.rankOf() && indices.isVector())
+            sizeOfUpdDims = 1;
+
+        std::vector<int> yTadDims(sizeOfUpdDims);
+        std::iota(yTadDims.begin(), yTadDims.end(), 0);
+
+        auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), ShapeUtils::evalDimsToExclude(updates.rankOf(), yTadDims));
+        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), zTadDims);
+
+        const Nd4jLong zTadLen = shape::length(packZ.primaryShapeInfo());
+        const Nd4jLong yTadLen = shape::length(packY.primaryShapeInfo());
+
+        const auto threadsPerBlock = nd4j::math::nd4j_max<int>(32, nd4j::math::nd4j_min<int>(zTadLen, 1024));
+        const auto blocksPerGrid = indices.lengthOf();
+
+        const auto xType = indices.dataType();
+        const auto yType = updates.dataType();
+
+        BUILD_DOUBLE_SELECTOR(xType, yType, scatterLockCudaLauncher, (blocksPerGrid, threadsPerBlock, 1024, context->getCudaStream(), op, indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.getSpecialBuffer(), packY.specialShapeInfo(), packY.specialOffsets(), output.getSpecialBuffer(), packZ.specialShapeInfo(), packZ.specialOffsets(), indices.lengthOf(), yTadLen, zTadLen), INDEXING_TYPES, GENERIC_NUMERIC_TYPES);
+
+
+
     const auto x = reinterpret_cast<const X*>(vx);
     const auto y = reinterpret_cast<const Y*>(vy);
           auto z = reinterpret_cast<Y*>(vz);
@@ -280,161 +855,143 @@ __global__ static void scatterLockCuda(const int opCode,
     }
 }
 
-///////////////////////////////////////////////////////////////////
-template<typename X, typename Y>
-static void scatterLockCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                    const int opCode,
-                                    const void* vx, const Nd4jLong *xShapeInfo,
-                                    const void* vy, const Nd4jLong *yTadShapeInfo, const Nd4jLong *yOffsets,
-                                          void* vz, const Nd4jLong *zTadShapeInfo, const Nd4jLong *zOffsets,
-                                    const Nd4jLong xLen, const Nd4jLong yTadLen, const Nd4jLong zTadLen) {
+            template<typename T, bool locking>
+            __global__ static void scatterCuda(const int opCode, const int numOfSubArrs,
+                                                     void* vx, const Nd4jLong *xShapeInfo, const Nd4jLong *xOffsets,
+                                                     void* vy, const Nd4jLong *yShapeInfo, const Nd4jLong *yOffsets,
+                                                     const int* indexes, unsigned int arrLenX, unsigned int arrLenY) {
 
-    scatterLockCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode, vx, xShapeInfo, vy, yTadShapeInfo, yOffsets, vz, zTadShapeInfo, zOffsets, xLen, yTadLen, zTadLen);
-}
+                __shared__ T *x, *y;
 
-///////////////////////////////////////////////////////////////////
-// x - indices, y - updates, z - input/output
-template<typename X, typename Y>
-__global__ static void scatterCuda(const int opCode,
-                                   const void *vx, const Nd4jLong *xShapeInfo,
-                                   const void *vy, const Nd4jLong *yShapeInfo,
-                                         void *vz, const Nd4jLong *zShapeInfo) {
+                if (locking) {
 
-    const auto x = reinterpret_cast<const X*>(vx);
-    const auto y = reinterpret_cast<const Y*>(vy);
-          auto z = reinterpret_cast<Y*>(vz);
+                    for (int e = 0; e < numOfSubArrs; e++) {
 
-    __shared__ int xRank, yRank, zRank;
-    __shared__ Nd4jLong yLen, totalThreads, *coord;
+                        const auto xIndex = indexes[e];
+                        const bool isOwner = xIndex < gridDim.x ? blockIdx.x == xIndex : blockIdx.x == xIndex % gridDim.x;
 
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        coord = reinterpret_cast<Nd4jLong*>(shmem);
-        yLen = shape::length(yShapeInfo);
-        totalThreads = gridDim.x * blockDim.x;
-        xRank = shape::rank(xShapeInfo);
-        yRank = shape::rank(yShapeInfo);
-        zRank = shape::rank(zShapeInfo);
-    }
-    __syncthreads();
+                        if (!isOwner)
+                            continue;
 
-    auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
-    auto yCoord = xCoord + xRank;
-    auto zCoord = yCoord + yRank;
+                        if (threadIdx.x == 0) {
+                            x = reinterpret_cast<T *>(vx) + xOffsets[xIndex];
+                            y = reinterpret_cast<T *>(vy) + yOffsets[e];
+                        }
+                        __syncthreads();
 
-    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+                        for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) {
 
-    for (Nd4jLong i = tid; i < yLen; i += totalThreads) {
+                            const auto xOffset = shape::getIndexOffset(i, xShapeInfo);
+                            const auto yOffset = shape::getIndexOffset(i, yShapeInfo);
 
-        shape::index2coords(i, yShapeInfo, yCoord);
+                            switch (opCode) {
+                                case pairwise::Add:
+                                    x[xOffset] += y[yOffset];
+                                    break;
+                                case pairwise::Subtract:
+                                    x[xOffset] -= y[yOffset];
+                                    break;
+                                case pairwise::Multiply:
+                                    x[xOffset] *= y[yOffset];
+                                    break;
+                                case pairwise::Divide:
+                                    x[xOffset] /= y[yOffset];
+                                    break;
+                                case pairwise::ReverseSubtract:
+                                    x[xOffset] = y[yOffset] - x[xOffset];
+                                    break;
+                                case pairwise::ReverseDivide:
+                                    x[xOffset] = y[yOffset] / x[xOffset];
+                                    break;
+                                case pairwise::CopyPws:
+                                    x[xOffset] = y[yOffset];
+                                    break;
+                                default:
+                                    continue;
+                            }
+                        }
+                        __syncthreads();
+                    }
+                } else {
+                    for (int e = blockIdx.x; e < numOfSubArrs; e+= gridDim.x) {
 
-        for (uint j = 0; j < xRank; ++j)
-            xCoord[j] = yCoord[j];
+                        if (threadIdx.x == 0) {
+                            const auto xIndex = indexes[e];
+                            x = reinterpret_cast<T *>(vx) + xOffsets[xIndex];
+                            y = reinterpret_cast<T *>(vy) + yOffsets[e];
+                        }
+                        __syncthreads();
 
-        const auto xOffset = shape::getOffset(xShapeInfo, xCoord);
-        zCoord[0] = x[xOffset];
+                        for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) {
+                            const auto xOffset = shape::getIndexOffset(i, xShapeInfo);
+                            const auto yOffset = shape::getIndexOffset(i, yShapeInfo);
 
-        for (uint j = 0; j < yRank - xRank; ++j)
-            zCoord[j + 1] = yCoord[xRank + j];
-
-        const auto yOffset = shape::getOffset(yShapeInfo, yCoord);
-        const auto zOffset = shape::getOffset(zShapeInfo, zCoord);
-
-        switch (opCode) {
-            case pairwise::Add:
-                z[zOffset] += y[yOffset];
-                break;
-            case pairwise::Subtract:
-                z[zOffset] -= y[yOffset];
-                break;
-            case pairwise::Multiply:
-                z[zOffset] *= y[yOffset];
-                break;
-            case pairwise::Divide:
-                z[zOffset] /= y[yOffset];
-                break;
-            case pairwise::ReverseSubtract:
-                z[zOffset] = y[yOffset] - z[zOffset];
-                break;
-            case pairwise::ReverseDivide:
-                z[zOffset] = y[yOffset] / z[zOffset];
-                break;
-            case pairwise::CopyPws:
-                z[zOffset] = y[yOffset];
-                break;
-            case pairwise::MaxPairwise:
-                if(z[zOffset] < y[yOffset]) z[zOffset] = y[yOffset];
-                break;
-            case pairwise::MinPairwise:
-                if(z[zOffset] > y[yOffset]) z[zOffset] = y[yOffset];
-                break;
-            default:
-                continue;
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-template<typename X, typename Y>
-static void scatterCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                const int opCode,
-                                const void *vx, const Nd4jLong *xShapeInfo,
-                                const void *vy, const Nd4jLong *yShapeInfo,
-                                      void *vz, const Nd4jLong *zShapeInfo) {
-
-    scatterCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
-}
+                            switch (opCode) {
+                                case pairwise::Add:
+                                    x[xOffset] += y[yOffset];
+                                    break;
+                                case pairwise::Subtract:
+                                    x[xOffset] -= y[yOffset];
+                                    break;
+                                case pairwise::Multiply:
+                                    x[xOffset] *= y[yOffset];
+                                    break;
+                                case pairwise::Divide:
+                                    x[xOffset] /= y[yOffset];
+                                    break;
+                                case pairwise::ReverseSubtract:
+                                    x[xOffset] = y[yOffset] - x[xOffset];
+                                    break;
+                                case pairwise::ReverseDivide:
+                                    x[xOffset] = y[yOffset] / x[xOffset];
+                                    break;
+                                case pairwise::CopyPws:
+                                    x[xOffset] = y[yOffset];
+                                    break;
+                                default:
+                                    continue;
+                            }
+                        }
+                        __syncthreads();
+                    }
+                }
+            }
 
 
-///////////////////////////////////////////////////////////////////
-void scatter(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) {
+            template <typename T>
+            void scatter_(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) {
+                std::vector<int> dims = {0};
+                auto inverted = ShapeUtils::evalDimsToExclude(output.rankOf(), dims);
 
-    PointersManager manager(context, "scatter");
+                auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), inverted);
+                auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), inverted);
 
-    NDArray::prepareSpecialUse({&output}, {&updates, &indices});
+                auto psX = packX.specialShapeInfo();
+                auto psY = packY.specialShapeInfo();
 
-    if(lock) {
+                PointersManager manager(context, "scatter");
 
-        const int xRank = indices.rankOf();
+                auto poX = packX.specialOffsets();
+                auto poY = packY.specialOffsets();
 
-        std::vector<int> zTadDims = ShapeUtils::evalDimsToExclude(output.rankOf(), {0});
+                NDArray::prepareSpecialUse({&output}, {&updates, &indices});
 
-        int sizeOfUpdDims = xRank;
-        if(output.rankOf() == updates.rankOf() && indices.isVector())
-            sizeOfUpdDims = 1;
+                unsigned int tadLengthX = shape::length(packX.primaryShapeInfo());
+                unsigned int tadLengthY = shape::length(packY.primaryShapeInfo());
+                if (tadLengthX != tadLengthY)
+                    throw std::runtime_error("scatter: Lengths of TADs must be equal");
 
-        std::vector<int> yTadDims(sizeOfUpdDims);
-        std::iota(yTadDims.begin(), yTadDims.end(), 0);
+                auto blockSize = nd4j::math::nd4j_max<int>(32, nd4j::math::nd4j_min<int>(tadLengthX, 1024));
 
-        auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), ShapeUtils::evalDimsToExclude(updates.rankOf(), yTadDims));
-        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), zTadDims);
+                if (lock)
+                    scatterCuda<T, true><<<512, blockSize, 1024, *context->getCudaStream()>>>(op, indices.lengthOf(), output.getSpecialBuffer(), psX, poX, updates.getSpecialBuffer(), psY, poY, reinterpret_cast<int *>(indices.getSpecialBuffer()), tadLengthX, tadLengthY);
+                else
+                    scatterCuda<T, false><<<512, blockSize, 1024, *context->getCudaStream()>>>(op, indices.lengthOf(), output.getSpecialBuffer(), psX, poX, updates.getSpecialBuffer(), psY, poY, reinterpret_cast<int *>(indices.getSpecialBuffer()), tadLengthX, tadLengthY);
 
-        const Nd4jLong zTadLen = shape::length(packZ.primaryShapeInfo());
-        const Nd4jLong yTadLen = shape::length(packY.primaryShapeInfo());
+                 NDArray::registerSpecialUse({&output}, {&updates, &indices});
+                manager.synchronize();
+            }
 
-        const auto threadsPerBlock = nd4j::math::nd4j_max<int>(32, nd4j::math::nd4j_min<int>(zTadLen, 1024));
-        const auto blocksPerGrid = indices.lengthOf();
-
-        const auto xType = indices.dataType();
-        const auto yType = updates.dataType();
-
-        BUILD_DOUBLE_SELECTOR(xType, yType, scatterLockCudaLauncher, (blocksPerGrid, threadsPerBlock, 1024, context->getCudaStream(), op, indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.getSpecialBuffer(), packY.specialShapeInfo(), packY.specialOffsets(), output.getSpecialBuffer(), packZ.specialShapeInfo(), packZ.specialOffsets(), indices.lengthOf(), yTadLen, zTadLen), INDEXING_TYPES, GENERIC_NUMERIC_TYPES);
-    }
-    else {
-
-        const int threadsPerBlock = MAX_NUM_THREADS / 8;
-        const int blocksPerGrid = (updates.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-        const int sharedMem = 8 * threadsPerBlock * (indices.rankOf() + updates.rankOf() + output.rankOf()) + 128;
-
-        const auto xType = indices.dataType();
-        const auto yType = updates.dataType();
-
-        BUILD_DOUBLE_SELECTOR(xType, yType, scatterCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), op, indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.getSpecialBuffer(), updates.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo()), INDEXING_TYPES, GENERIC_NUMERIC_TYPES);
-    }
-
-    NDArray::registerSpecialUse({&output}, {&updates, &indices});
-    manager.synchronize();
-}
 
 
 ///////////////////////////////////////////////////////////////////
@@ -447,6 +1004,27 @@ __global__ static void scatterNDLockCuda(const int opCode,
                                          const Nd4jLong *zShapeInfo,
                                          const Nd4jLong numOfXTads, const Nd4jLong numOfZTads, const Nd4jLong yTadLen) {
 
+
+
+---------------------------------------------------------------------------
+const int xLastDim = indices.sizeAt(-1);
+
+        // y_tad and z_tad have the same shape
+        std::vector<int> yTadDims(zRank - xLastDim), zTadDims(zRank - xLastDim);
+        for (int j = 0, i = zTadDims.size() - 1; i >=0 ; --i, ++j) {
+            yTadDims[i] = yRank - 1 - j;
+            zTadDims[i] = zRank - 1 - j;
+        }
+
+        auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(indices.getShapeInfo(), {xRank - 1});
+        auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), yTadDims);
+        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), zTadDims);
+
+        const int threadsPerBlock = MAX_NUM_THREADS / 4;
+        const int blocksPerGrid = packZ.numberOfTads();
+        const int sharedMem = 8 * threadsPerBlock * xLastDim + 128;
+---------------------------------------------------------------------------
+
     // zTadLen == yTadLen if numOfZTads > 1, in opposite case z and y are vectors
     // numOfXTads == numOfYTads if numOfZTads > 1, in opposite case z and y are vectors
 
@@ -565,252 +1143,7 @@ __global__ static void scatterNDLockCuda(const int opCode,
     }
 }
 
-///////////////////////////////////////////////////////////////////
-template<typename X, typename Y>
-static void scatterNDLockCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                      const int opCode,
-                                      const void* vx, const Nd4jLong *xTadShapeInfo, const Nd4jLong *xOffsets,
-                                      const void* vy, const Nd4jLong *yTadShapeInfo, const Nd4jLong *yOffsets,
-                                            void* vz, const Nd4jLong *zTadShapeInfo, const Nd4jLong *zOffsets,
-                                      const Nd4jLong *zShapeInfo,
-                                      const Nd4jLong numOfXTads, const Nd4jLong numOfZTads, const Nd4jLong zTadLen) {
-
-    scatterNDLockCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode,
-                                                                                   vx, xTadShapeInfo, xOffsets,
-                                                                                   vy, yTadShapeInfo, yOffsets,
-                                                                                   vz, zTadShapeInfo, zOffsets,
-                                                                                   zShapeInfo,
-                                                                                   numOfXTads, numOfZTads, zTadLen);
-}
-
-///////////////////////////////////////////////////////////////////
-// x - indices, y - updates, z - output
-template<typename X, typename Y>
-__global__ static void scatterNDCuda(const int opCode,
-                                     const void *vx, const Nd4jLong *xShapeInfo,
-                                     const void *vy, const Nd4jLong *yShapeInfo,
-                                           void *vz, const Nd4jLong *zShapeInfo) {
-
-    const auto x = reinterpret_cast<const X*>(vx);
-    const auto y = reinterpret_cast<const Y*>(vy);
-          auto z = reinterpret_cast<Y*>(vz);
-
-    __shared__ int xRank, yRank, zRank, xLastDim;
-    __shared__ Nd4jLong yLen, totalThreads, *coord;
-
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        coord = reinterpret_cast<Nd4jLong*>(shmem);
-        yLen = shape::length(yShapeInfo);
-        totalThreads = gridDim.x * blockDim.x;
-        xRank = shape::rank(xShapeInfo);
-        yRank = shape::rank(yShapeInfo);
-        zRank = shape::rank(zShapeInfo);
-        xLastDim = xShapeInfo[xRank];
-    }
-    __syncthreads();
-
-    auto xCoord = coord + threadIdx.x * (xRank + yRank + zRank);
-    auto yCoord = xCoord + xRank;
-    auto zCoord = yCoord + yRank;
-
-    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    for (Nd4jLong i = tid; i < yLen; i += totalThreads) {
-
-        shape::index2coords(i, yShapeInfo, yCoord);
-
-        for (uint j = 0; j < xRank - 1; ++j)
-            xCoord[j] = yCoord[j];
-
-        for (uint j = 0; j < xLastDim; ++j) {
-            xCoord[xRank - 1] = j;
-            const auto xOffset = shape::getOffset(xShapeInfo, xCoord);
-            zCoord[j] = x[xOffset];
-        }
-
-        for (uint j = xLastDim; j < zRank; ++j)
-            zCoord[j] = yCoord[yRank - zRank + j];
-
-        const auto yOffset = shape::getOffset(yShapeInfo, yCoord);
-        const auto zOffset = shape::getOffset(zShapeInfo, zCoord);
-
-        switch (opCode) {
-            case pairwise::Add:
-                z[zOffset] += y[yOffset];
-                break;
-            case pairwise::Subtract:
-                z[zOffset] -= y[yOffset];
-                break;
-            case pairwise::Multiply:
-                z[zOffset] *= y[yOffset];
-                break;
-            case pairwise::Divide:
-                z[zOffset] /= y[yOffset];
-                break;
-            case pairwise::ReverseSubtract:
-                z[zOffset] = y[yOffset] - z[zOffset];
-                break;
-            case pairwise::ReverseDivide:
-                z[zOffset] = y[yOffset] / z[zOffset];
-                break;
-            case pairwise::CopyPws:
-                z[zOffset] = y[yOffset];
-                break;
-            case pairwise::MaxPairwise:
-                if(z[zOffset] < y[yOffset]) z[zOffset] = y[yOffset];
-                break;
-            case pairwise::MinPairwise:
-                if(z[zOffset] > y[yOffset]) z[zOffset] = y[yOffset];
-                break;
-            default:
-                continue;
-        }
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-template<typename X, typename Y>
-static void scatterNDCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
-                                  const int opCode,
-                                  const void *vx, const Nd4jLong *xShapeInfo,
-                                  const void *vy, const Nd4jLong *yShapeInfo,
-                                        void *vz, const Nd4jLong *zShapeInfo) {
-
-    scatterNDCuda<X,Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(opCode, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
-}
-
-///////////////////////////////////////////////////////////////////
-void scatterND(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) {
-
-    const int xRank = indices.rankOf();
-    const int yRank = updates.rankOf();
-    const int zRank = output.rankOf();
-
-    PointersManager manager(context, "scatterND");
-
-    NDArray::prepareSpecialUse({&output}, {&updates, &indices});
-
-    if(lock) {
-
-        const int xLastDim = indices.sizeAt(-1);
-
-        // y_tad and z_tad have the same shape
-        std::vector<int> yTadDims(zRank - xLastDim), zTadDims(zRank - xLastDim);
-        for (int j = 0, i = zTadDims.size() - 1; i >=0 ; --i, ++j) {
-            yTadDims[i] = yRank - 1 - j;
-            zTadDims[i] = zRank - 1 - j;
-        }
-
-        auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(indices.getShapeInfo(), {xRank - 1});
-        auto packY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), yTadDims);
-        auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output.getShapeInfo(), zTadDims);
-
-        const int threadsPerBlock = MAX_NUM_THREADS / 4;
-        const int blocksPerGrid = packZ.numberOfTads();
-        const int sharedMem = 8 * threadsPerBlock * xLastDim + 128;
-
-        const auto xType = indices.dataType();
-        const auto yType = updates.dataType();
-
-        BUILD_DOUBLE_SELECTOR(xType, yType, scatterNDLockCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), op, indices.getSpecialBuffer(), packX.specialShapeInfo(), packX.specialOffsets(), updates.getSpecialBuffer(), packY.specialShapeInfo(), packY.specialOffsets(), output.getSpecialBuffer(), packZ.specialShapeInfo(), packZ.specialOffsets(), output.getSpecialShapeInfo(), packX.numberOfTads(), packZ.numberOfTads(), shape::length(packY.primaryShapeInfo())), INDEXING_TYPES, GENERIC_NUMERIC_TYPES);
-    }
-    else {
-
-        const int threadsPerBlock = MAX_NUM_THREADS / 8;
-        const int blocksPerGrid = (updates.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-        const int sharedMem = 8 * threadsPerBlock * (xRank + yRank + zRank) + 128;
-
-        const auto xType = indices.dataType();
-        const auto yType = updates.dataType();
-
-        BUILD_DOUBLE_SELECTOR(xType, yType, scatterNDCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), op, indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.getSpecialBuffer(), updates.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo()), INDEXING_TYPES, GENERIC_NUMERIC_TYPES);
-    }
-
-    NDArray::registerSpecialUse({&output}, {&updates, &indices});
-    manager.synchronize();
-}
-
-///////////////////////////////////////////////////////////////////
-template<typename X, typename Z>
-__global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
-                                         void *vy, const Nd4jLong *yShapeInfo,
-                                         void *vz, const Nd4jLong *zShapeInfo) {
-
-    const auto x = reinterpret_cast<const X*>(vx);
-          auto y = reinterpret_cast<Z*>(vy);
-          auto z = reinterpret_cast<Z*>(vz);
-
-    __shared__ Nd4jLong xLen, *sharedMem;
-    __shared__ int xRank;   // xRank = zRank, yRank = xRank + 1
-
-    if (threadIdx.x == 0) {
-        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
-
-        xLen  = shape::length(xShapeInfo);
-        xRank = shape::rank(xShapeInfo);
-    }
-    __syncthreads();
-
-    const auto xInd = threadIdx.x + blockIdx.x * blockDim.x;
-
-    if(xInd >= xLen)
-        return;
-
-    auto coords = sharedMem + threadIdx.x * (xRank + 1);
-
-    shape::index2coords(xInd, xShapeInfo, coords);
-
-    // y last coordinate
-    coords[xRank] = x[shape::getOffset(xShapeInfo, coords)];
-
-    const auto yOffset = shape::getOffset(yShapeInfo, coords);
-
-    if(z == nullptr) { // gradient calculation
-        y[yOffset] -= 1.f;
-    }
-    else {
-        z[shape::getOffset(zShapeInfo, coords)] = y[yOffset];
-    }
-}
-
-///////////////////////////////////////////////////////////////////
-template<typename X, typename Z>
-static void scatterForLossCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream, const void *vx, const Nd4jLong* xShapeInfo, void *vy, const Nd4jLong* yShapeInfo, void *vz, const Nd4jLong* zShapeInfo) {
-
-    scatterForLossCuda<X, Z><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo);
-}
-
-///////////////////////////////////////////////////////////////////
-void scatterForLoss(nd4j::LaunchContext* context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad) {
-    // shapes of indices and output must be the same
-    // shape of indices should be the same as updates shape with last dimension excluded, for example if updates is {a,b,c} then indices should be {a,b}
-
-    PointersManager manager(context, "scatterForLoss");
-
-    const int threadsPerBlock = MAX_NUM_THREADS / 2;
-    const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = updates.rankOf() * sizeof(Nd4jLong) * threadsPerBlock  + 128;
-
-    if(calcGrad) {
-        NDArray::prepareSpecialUse({&updates}, {&indices});
-        BUILD_DOUBLE_SELECTOR(indices.dataType(), updates.dataType(), scatterForLossCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.specialBuffer(), updates.specialShapeInfo(), nullptr, nullptr), INDEXING_TYPES, FLOAT_TYPES);
-        NDArray::registerSpecialUse({&updates}, {&indices});
-    }
-    else {
-        NDArray::prepareSpecialUse({&output}, {&indices, &updates});
-        BUILD_DOUBLE_SELECTOR(indices.dataType(), updates.dataType(), scatterForLossCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), indices.getSpecialBuffer(), indices.getSpecialShapeInfo(), updates.getSpecialBuffer(), updates.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo()), INDEXING_TYPES, FLOAT_TYPES);
-        NDArray::registerSpecialUse({&output}, {&indices, &updates});
-    }
-
-    manager.synchronize();
-}
-
-}
-}
-}
-
+*/
         // PointersManager manager(&context, "NativeOps::concat");
         // PointersManager::printDevContentOnDev<int>(vx, 2);
         // PointersManager::printDevContentOnDev<Nd4jLong>(xShapeInfo, 8);
diff --git a/libnd4j/include/ops/declarable/helpers/scatter.h b/libnd4j/include/ops/declarable/helpers/scatter.h
index d0eb76a52..b470285ff 100644
--- a/libnd4j/include/ops/declarable/helpers/scatter.h
+++ b/libnd4j/include/ops/declarable/helpers/scatter.h
@@ -31,6 +31,8 @@ namespace nd4j {
             void scatterND(nd4j::LaunchContext* context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock);
 
             void scatterForLoss(nd4j::LaunchContext* context, const NDArray& indices, NDArray& updates, NDArray& output, const bool calcGrad);
+
+            Nd4jLong checkIndices(nd4j::LaunchContext *context, const NDArray& indices, const NDArray& output, const int axis = -1);
         }
     }
 }
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
index d29d1f0e1..c2e39cab5 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp
@@ -76,6 +76,16 @@ TEST_F(DeclarableOpsTests16, scatter_upd_2) {
     delete result;
 }
 
+TEST_F(DeclarableOpsTests16, scatter_upd_3) {
+
+    NDArray x('c', {10, 3}, nd4j::DataType::FLOAT32);
+    NDArray indices('c', {2}, {20,5}, nd4j::DataType::INT32);
+    NDArray updates('c', {2, 3}, {100,101,102,  200,201,202}, nd4j::DataType::FLOAT32);
+    NDArray output('c', {10, 3}, nd4j::DataType::FLOAT32);
+
+    nd4j::ops::scatter_upd op;
+    ASSERT_ANY_THROW(op.execute({&x, &indices, &updates}, {&output}, {}, {}, {true, true}));
+}
 
 TEST_F(DeclarableOpsTests16, test_size_dtype_1) {
     auto x = NDArrayFactory::create<float>('c', {3}, {1, 1, 1});
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp
index 4941e7459..076d14385 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests2.cpp
@@ -62,7 +62,7 @@ TEST_F(DeclarableOpsTests2, gather_2) {
 
     nd4j::ops::gather op;
 
-    auto result = op.execute({&input}, {}, {1, 0,1, 2,2, 1,2});
+    auto result = op.execute({&input}, {}, {1, 0,1, 2,2, 1,2}, {true});
 
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
@@ -125,7 +125,7 @@ TEST_F(DeclarableOpsTests2, gather_5) {
 
     nd4j::ops::gather op;
 
-    auto result = op.execute({&input, &indices}, {}, {1});
+    auto result = op.execute({&input, &indices}, {}, {1}, {true});
 
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
@@ -294,7 +294,7 @@ TEST_F(DeclarableOpsTests2, gather_13) {
 
     nd4j::ops::gather op;
 
-    auto result = op.execute({&input, &indices}, {}, {2});
+    auto result = op.execute({&input, &indices}, {}, {2}, {true});
 
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
@@ -306,6 +306,30 @@ TEST_F(DeclarableOpsTests2, gather_13) {
     delete result;
 }
 
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests2, gather_14) {
+
+    NDArray input   ('c', {2,3,4},   {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24});
+    NDArray indices ('c', {2,3},     {0, 10, 2, 20, 1,2}, nd4j::DataType::INT32);
+    NDArray output('c', {2,2,3,4});
+
+    nd4j::ops::gather op;
+
+    ASSERT_ANY_THROW(op.execute({&input, &indices}, {&output}, {}, {1}, {true}));
+}
+
+////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests2, gather_15) {
+
+    NDArray input   ('c', {2,3,4,5}, nd4j::DataType::DOUBLE);
+    NDArray indices ('c', {2,3,4}, {0, 10, 2, 3, 0, 1, 20, 3, 0, 1, 2, 3,0, 1, 2, 3, 0, 1, 2, 30, 0, 1, 2, 3}, nd4j::DataType::INT32);
+    NDArray output('c', {2,3,  2,3,4,  5});
+
+    nd4j::ops::gather op;
+
+    ASSERT_ANY_THROW(op.execute({&input, &indices}, {&output}, {}, {2}, {true}));
+}
+
 TEST_F(DeclarableOpsTests2, BroadcastGradientArgs_1) {
 
     NDArray input   ('c', {3,3,4},   {1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 13,14,15,16,17,18,19,20,21,22,23,24, 25,26,27,28,29,30,31,32,33,34,35,36}, nd4j::DataType::INT32);
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp
index 2e8d96f3c..2c15f24bc 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp
@@ -704,7 +704,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test2) {
     auto expected = NDArrayFactory::create<double>('c', {2,2,2}, {23, 24, 11, 12, 3,  4, 3,  4});
 
     nd4j::ops::gather_nd op;
-    auto results = op.execute({&input, &indices}, {}, {});
+    auto results = op.execute({&input, &indices}, {}, {}, {true});
     auto output = results->at(0);
 
     ASSERT_EQ(Status::OK(), results->status());
@@ -798,7 +798,7 @@ TEST_F(DeclarableOpsTests5, gatherNd_test7) {
     auto expected = NDArrayFactory::create<double>('c', {3,3}, {3,5,5,8,5,10,2,2,14});
 
     nd4j::ops::gather_nd op;
-    auto results = op.execute({&input, &indices}, {}, {});
+    auto results = op.execute({&input, &indices}, {}, {}, {true});
     auto output = results->at(0);
 
     ASSERT_EQ(Status::OK(), results->status());
@@ -825,6 +825,52 @@ TEST_F(DeclarableOpsTests5, gatherNd_test8) {
     delete result;
 }
 
+TEST_F(DeclarableOpsTests5, gatherNd_test9) {
+    auto x = NDArrayFactory::create<double>('c', {2, 4, 2, 2});
+    auto indices = NDArrayFactory::create<int>('c', {3, 3}, {0,2,1, 0,1,0, 1,3,1});
+    auto exp = NDArrayFactory::create<double>('c', {3,2}, {11.f, 12.f, 5.f, 6.f, 31.f, 32.f});
+    x.linspace(1);
+
+    nd4j::ops::gather_nd op;
+    auto result = op.execute({&x, &indices}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+
+    //z->printIndexedBuffer();
+    //z->printShapeInfo("z shape");
+
+    ASSERT_TRUE(exp.isSameShape(z));
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
+//////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests5, gatherNd_test10) {
+
+    auto input = NDArrayFactory::create<double>('c', {4, 3, 2});
+    auto indices = NDArrayFactory::create<int>('c', {2,2,2}, {30,20,1,2, 0,10,0,1});
+
+    auto output = NDArrayFactory::create<double>('c', {2,2,2});
+
+    nd4j::ops::gather_nd op;
+
+    ASSERT_ANY_THROW(op.execute({&input, &indices}, {&output}, {}, {}, {true}));
+}
+
+//////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests5, gatherNd_test11) {
+
+    auto input = NDArrayFactory::create<double>('c', {4, 4});
+    auto indices = NDArrayFactory::create<int>('c', {3,3,2}, {0,2,1, 0,10,0, 1,30,1, 0,20,1, 0,1,0, 1,30,1});
+    auto output = NDArrayFactory::create<double>('c', {3,3});
+
+    nd4j::ops::gather_nd op;
+
+    ASSERT_ANY_THROW(op.execute({&input, &indices}, {&output}, {}, {}, {true}));
+}
+
 //////////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests5, reverse_sequense_test1) {
 
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
index 79a569e0f..be25f0c62 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
@@ -314,27 +314,6 @@ TEST_F(DeclarableOpsTests6, Test_Simple_Scalar_1) {
     delete result;
 }
 
-TEST_F(DeclarableOpsTests6, Test_gatherNd_Edge_1) {
-    auto x = NDArrayFactory::create<double>('c', {2, 4, 2, 2});
-    auto indices = NDArrayFactory::create<int>('c', {3, 3}, {0,2,1, 0,1,0, 1,3,1});
-    auto exp = NDArrayFactory::create<double>('c', {3,2}, {11.f, 12.f, 5.f, 6.f, 31.f, 32.f});
-    x.linspace(1);
-
-    nd4j::ops::gather_nd op;
-    auto result = op.execute({&x, &indices}, {}, {});
-    ASSERT_EQ(Status::OK(), result->status());
-
-    auto z = result->at(0);
-
-    //z->printIndexedBuffer();
-    //z->printShapeInfo("z shape");
-
-    ASSERT_TRUE(exp.isSameShape(z));
-    ASSERT_TRUE(exp.equalsTo(z));
-
-    delete result;
-}
-
 TEST_F(DeclarableOpsTests6, Test_Order_1) {
     auto x = NDArrayFactory::create<double>('f', {2, 3});
     auto exp = NDArrayFactory::create<double>('c', {2, 3});
diff --git a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
index d5880d689..17f00011c 100644
--- a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
@@ -756,7 +756,7 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_4) {
     auto exp = NDArrayFactory::create<float>('c', {2, 2, 2}, {3, 4, 5, 6, 5, 6, 7, 8});
 
     nd4j::ops::scatter_add op;
-    auto result = op.execute({&matrix, &idc, &updates}, {}, {}, {true});
+    auto result = op.execute({&matrix, &idc, &updates}, {}, {}, {true, true});
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
     auto z = result->at(0);
@@ -791,7 +791,7 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_6) {
     auto exp = NDArrayFactory::create<float>('c', {2, 2, 2}, {7, 9, 11, 13, 7, 9, 11, 13});
 
     nd4j::ops::scatter_add op;
-    auto result = op.execute({&matrix, &idc, &updates}, {}, {}, {true});
+    auto result = op.execute({&matrix, &idc, &updates}, {}, {}, {true, true});
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
     auto z = result->at(0);
@@ -837,6 +837,18 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_8) {
     ASSERT_TRUE(expected.equalsTo(z));
 }
 
+////////////////////////////////////////////////////////////////////
+TEST_F(ParityOpsTests, Test_Scatter_Add_9) {
+    auto matrix = NDArrayFactory::create<float>('c', {2, 2, 3}, {1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1});
+    NDArray idc('c', {2, 2}, {1, 10, 0, 0}, nd4j::DataType::INT64);
+    auto updates = NDArrayFactory::create<float>('c', {2, 2, 2, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+    auto output = NDArrayFactory::create<float>('c', {2, 2, 3});
+
+    nd4j::ops::scatter_add op;
+
+    ASSERT_ANY_THROW(op.execute({&matrix, &idc, &updates}, {&output}, {}, {}, {true, true}));
+}
+
 ////////////////////////////////////////////////////////////////////
 TEST_F(ParityOpsTests, scatterMax_test1) {
     auto matrix = NDArrayFactory::create<float>('c', {2, 2}, {1, 2, 3, 4});
@@ -1010,6 +1022,18 @@ TEST_F(ParityOpsTests, scatterMin_test4) {
     delete result;
 }
 
+////////////////////////////////////////////////////////////////////////
+TEST_F(ParityOpsTests, scatterMin_test5) {
+    auto matrix = NDArrayFactory::create<float>('c', {2, 2, 2}, {1, 2, 3, 4, 5, 6, 7, 8});
+    NDArray idc('c', {1,2}, {10,10}, nd4j::DataType::INT32);
+    auto updates = NDArrayFactory::create<float>('c', {1, 2, 2, 2}, {1,10,1,10, 1,1,10,1.});
+    auto output = NDArrayFactory::create<float>('c', {2, 2, 2});
+
+    nd4j::ops::scatter_min op;
+
+    ASSERT_ANY_THROW(op.execute({&matrix, &idc, &updates}, {&output}, {}, {}, {true, true}));
+}
+
 ////////////////////////////////////////////////////////////////////////
 TEST_F(ParityOpsTests, scatterND_test1) {
 
@@ -1019,7 +1043,7 @@ TEST_F(ParityOpsTests, scatterND_test1) {
     auto exp = NDArrayFactory::create<float>('c', {3, 4}, {50.f, 60.f, 70.f, 80.f, 10.f, 20.f, 30.f, 40.f, 0.f,  0.f,  0.f,  0.f});
 
     nd4j::ops::scatter_nd op;
-    auto result = op.execute({&indices, &updates, &shape}, {}, {});
+    auto result = op.execute({&indices, &updates, &shape}, {}, {false, true});
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
     auto z = result->at(0);
@@ -1066,7 +1090,7 @@ TEST_F(ParityOpsTests, scatterND_test3) {
     updates.linspace(1.f);
 
     nd4j::ops::scatter_nd op;
-    auto result = op.execute({&indices, &updates, &shape}, {}, {});
+    auto result = op.execute({&indices, &updates, &shape}, {}, {false, true});
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
     auto z = result->at(0);
@@ -1160,7 +1184,7 @@ TEST_F(ParityOpsTests, scatterND_test7) {
     updates.linspace(1);
 
     nd4j::ops::scatter_nd op;
-    auto result = op.execute({&indices, &updates, &shape}, {}, {}, {true});
+    auto result = op.execute({&indices, &updates, &shape}, {}, {}, {true, true});
     ASSERT_EQ(ND4J_STATUS_OK, result->status());
 
     auto z = result->at(0);
@@ -1193,6 +1217,20 @@ TEST_F(ParityOpsTests, scatterND_test8) {
     delete result;
 }
 
+////////////////////////////////////////////////////////////////////////
+TEST_F(ParityOpsTests, scatterND_test9) {
+
+    NDArray indices('c', {2, 3, 1}, {0., 20., 7., 30., 6., 90.}, nd4j::DataType::INT32);
+    auto updates = NDArrayFactory::create<float>('c', {2,3, 3,4});
+    auto shape = NDArrayFactory::create<int>('c', {3}, {10, 3, 4});
+    auto output = NDArrayFactory::create<float>('c', {10, 3, 4});
+
+    nd4j::ops::scatter_nd op;
+
+    ASSERT_ANY_THROW(auto result = op.execute({&indices, &updates, &shape}, {&output}, {}, {}, {false, true}));
+}
+
+
 ////////////////////////////////////////////////////////////////////////
 TEST_F(ParityOpsTests, scatterND_add_test1) {
 
@@ -1323,6 +1361,19 @@ TEST_F(ParityOpsTests, scatterND_add_test5) {
     delete result;
 }
 
+////////////////////////////////////////////////////////////////////////
+TEST_F(ParityOpsTests, scatterND_add_test6) {
+
+    auto input = NDArrayFactory::create<float>('c', {6, 4});
+    NDArray indices('c', {2, 3, 1}, {50.f, 1.f, 2.f, 3.f, 40.f, 0.f}, nd4j::DataType::INT32);
+    auto updates = NDArrayFactory::create<float>('c', {2,3,4});
+    auto output = NDArrayFactory::create<float>('c', {6,4});
+
+    nd4j::ops::scatter_nd_add op;
+
+    ASSERT_ANY_THROW(op.execute({&input, &indices, &updates}, {&output}, {}, {}, {false, true}));
+}
+
 ////////////////////////////////////////////////////////////////////////
 TEST_F(ParityOpsTests, scatterND_sub_test1) {
 
@@ -1586,6 +1637,19 @@ TEST_F(ParityOpsTests, scatterND_update_test5) {
     delete result;
 }
 
+////////////////////////////////////////////////////////////////////////
+TEST_F(ParityOpsTests, scatterND_update_test6) {
+
+    auto input = NDArrayFactory::create<float>('c', {6, 4});
+    NDArray indices('c', {3, 3, 2}, {0.f,0.f, 10.f,1.f, 20.f,2.f, 30.f,3.f, 40.f,0.f, 50.f,1.f, 0.f,2.f, 1.f,3.f, 2.f,0.f}, nd4j::DataType::INT32);
+    auto updates = NDArrayFactory::create<float>('c', {3,3});
+    auto output = NDArrayFactory::create<float>('c', {6,4});
+
+    nd4j::ops::scatter_nd_update op;
+
+    ASSERT_ANY_THROW(op.execute({&input, &indices, &updates}, {&output}, {}, {}, {true, true}));
+}
+
 //////////////////////////////////////////////////////////////////////
 TEST_F(ParityOpsTests, scatter_update_1) {