diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index 8bf0dda73..75b515d4b 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -155,20 +155,20 @@ namespace nd4j { #ifndef __JAVACPP_HACK__ NDArray(std::shared_ptr buffer, const ShapeDescriptor& descriptor, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const Nd4jLong offset = 0); - NDArray(std::shared_ptr buffer, const char order, const std::vector &shape, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); + NDArray(std::shared_ptr buffer, const char order, const std::vector &shape, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); #endif /** * do not allocate memory, memory for array is passed from outside */ - NDArray(void *buffer, Nd4jLong* shapeInfo, nd4j::LaunchContext * context = nd4j::LaunchContext::defaultContext(), const bool isBuffAlloc = false); + NDArray(void *buffer, Nd4jLong* shapeInfo, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const bool isBuffAlloc = false); /** * do not allocate memory, memory for array is passed from outside * we suppose the content of both (device and host) buffers is identical */ - NDArray(void *buffer, void *bufferD, Nd4jLong* shapeInfo, nd4j::LaunchContext * context = nd4j::LaunchContext::defaultContext(), const bool isBuffAlloc = false, const bool isBuffDAlloc = false); + NDArray(void *buffer, void *bufferD, Nd4jLong* shapeInfo, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const bool isBuffAlloc = false, const bool isBuffDAlloc = false); /** * copy constructor @@ -189,28 +189,28 @@ namespace nd4j { /** * constructor creates new NDArray using shape information from "shapeInfo", set all elements in new array to zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently */ - NDArray(Nd4jLong* shapeInfo, const bool copyStrides = false, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); + NDArray(Nd4jLong* shapeInfo, const bool copyStrides = false, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); /** * constructor creates new NDArray using shape information from "shapeInfo", set all elements in new array to be zeros, if copyStrides is true then use stride values from "shapeInfo", else calculate strides independently * set dtype as array type */ - NDArray(Nd4jLong* shapeInfo, const nd4j::DataType dtype, const bool copyStrides = false, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); + NDArray(Nd4jLong* shapeInfo, const nd4j::DataType dtype, const bool copyStrides = false, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); /** * this constructor creates new array using shape information contained in vector argument */ - NDArray(const char order, const std::vector &shape, nd4j::DataType dtype = DOUBLE, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); + NDArray(const char order, const std::vector &shape, nd4j::DataType dtype = DOUBLE, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); /** * This constructor creates new array with elements copied from data and using shape information stored in shape, elements from data will be casted to dtype */ - NDArray(const char order, const std::vector &shape, const std::vector& data, nd4j::DataType dtype = DOUBLE, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); + NDArray(const char order, const std::vector &shape, const 
std::vector& data, nd4j::DataType dtype = DOUBLE, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext()); /** - * this constructor creates new array using given buffer (without memory allocating) and shape information stored in shape + * this constructor creates new array using given buffer (without memory allocation) and shape information stored in shape */ - NDArray(void *buffer, const char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext()); + NDArray(void *buffer, const char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const bool isBuffAlloc = false); /** * this constructor creates new NDArray with shape matching "other" array, @@ -221,7 +221,7 @@ namespace nd4j { /** * this constructor creates scalar(and set its value = 0) or empty array depending on bool argument isScalar */ - NDArray(nd4j::DataType dtype, nd4j::LaunchContext * context = nd4j::LaunchContext ::defaultContext(), const bool isScalar = true); + NDArray(nd4j::DataType dtype, nd4j::LaunchContext* context = nd4j::LaunchContext::defaultContext(), const bool isScalar = true); /** * This method blocks until asynchronous operation finishes diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index fbc57b5cf..6c55ba04f 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -132,9 +132,8 @@ NDArray::NDArray(const NDArray *other, const bool copyStrides, nd4j::LaunchConte _buffer = std::make_shared(lengthOf() * sizeOfT(), dataType(), getContext()->getWorkspace()); } - //////////////////////////////////////////////////////////////////////// -NDArray::NDArray(void* buffer, const char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext * context) { +NDArray::NDArray(void* buffer, const char order, const std::vector &shape, nd4j::DataType dtype, nd4j::LaunchContext * context, const bool isBuffAlloc) { if (shape.empty()) throw std::runtime_error("NDArray constructor: input shape is empty !"); @@ -148,7 +147,7 @@ NDArray::NDArray(void* buffer, const char order, const std::vector &sh setShapeInfo(ShapeDescriptor(dtype, order, shape)); - _buffer = std::make_shared(buffer, lengthOf() * sizeOfT(), dataType(), true, getContext()->getWorkspace()); + _buffer = std::make_shared(buffer, lengthOf() * sizeOfT(), dataType(), isBuffAlloc, getContext()->getWorkspace()); } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index b651ed6ff..80e2cb00e 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -1498,16 +1498,6 @@ void NativeOps::specialConcat( * This method saves */ nd4j::TadPack* NativeOps::tadOnlyShapeInfo(Nd4jLong *dXShapeInfo, int *dimension, int dimensionLength) { - /*shape::TAD tad; - tad.init(dXShapeInfo, dimension, dimensionLength); - //tad->setOutputBuffer(target); - tad.createTadOnlyShapeInfo(); - tad.createOffsets(); - - - std::memcpy(reinterpret_cast(target), tad.tadOnlyShapeInfo, shape::shapeInfoByteLength(tad.tadOnlyShapeInfo)); - std::memcpy(reinterpret_cast(offsets), tad.tadOffsets, tad.numTads * sizeof(Nd4jLong)); - */ auto pack = new TadPack(); *pack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(dXShapeInfo, dimension, dimensionLength); return pack; diff --git a/libnd4j/include/helpers/ConstantTadHelper.h b/libnd4j/include/helpers/ConstantTadHelper.h index 
1a48dab07..9eb25cb33 100644 --- a/libnd4j/include/helpers/ConstantTadHelper.h +++ b/libnd4j/include/helpers/ConstantTadHelper.h @@ -45,9 +45,9 @@ namespace nd4j { static ConstantTadHelper* getInstance(); - TadPack& tadForDimensions(Nd4jLong *originalShape, const std::vector &dimensions, const bool keepUnitiesInShape = false); - TadPack& tadForDimensions(Nd4jLong *originalShape, int* dimensions, int dimLength, const bool keepUnitiesInShape = false); - TadPack& tadForDimensions(Nd4jLong *originalShape, int dimensions, const bool keepUnitiesInShape = false); + TadPack& tadForDimensions(const Nd4jLong *originalShape, const std::vector &dimensions, const bool keepUnitiesInShape = false); + TadPack& tadForDimensions(const Nd4jLong *originalShape, int* dimensions, int dimLength, const bool keepUnitiesInShape = false); + TadPack& tadForDimensions(const Nd4jLong *originalShape, int dimensions, const bool keepUnitiesInShape = false); TadPack& tadForDimensions(ShapeDescriptor &descriptor, std::vector &dimensions, const bool keepUnitiesInShape = false); TadPack& tadForDimensions(TadDescriptor &descriptor); }; diff --git a/libnd4j/include/helpers/GradCheck.h b/libnd4j/include/helpers/GradCheck.h index 9add1f0bc..cda0b5eae 100644 --- a/libnd4j/include/helpers/GradCheck.h +++ b/libnd4j/include/helpers/GradCheck.h @@ -29,20 +29,20 @@ namespace nd4j { class GradCheck { - public: - enum LossFunc {MEAN = 0, SUM = 1}; + public: + enum LossFunc {MEAN = 0, SUM = 1}; private: static constexpr double EPSILON = 1e-5; static constexpr double MAXRELERR = 1e-5; static constexpr double MINABSERR = 1e-6; static void fillGradArrays(const LossFunc loss, const std::vector& gradArrs); - - public: - - /** + + public: + + /** * performs numerical check of gradients in back prop - * + * * opFF - feed forward operation * opBP - back propagation operation * argsHolderFF - argument holder for feed forward operation @@ -61,7 +61,7 @@ class GradCheck { // ////////////////////////////////////////////////////////////////////////// -// ///// IMLEMENTATION OF INLINE METHODS ///// +// ///// IMLEMENTATION OF INLINE METHODS ///// // ////////////////////////////////////////////////////////////////////////// // template diff --git a/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp b/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp index fa3d78684..5100ca3ff 100644 --- a/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp +++ b/libnd4j/include/helpers/cpu/ConstantTadHelper.cpp @@ -38,15 +38,15 @@ namespace nd4j { return _INSTANCE; } - TadPack& ConstantTadHelper::tadForDimensions(Nd4jLong *originalShape, int dimension, const bool keepUnitiesInShape) { + TadPack& ConstantTadHelper::tadForDimensions(const Nd4jLong *originalShape, int dimension, const bool keepUnitiesInShape) { return tadForDimensions(originalShape, &dimension, 1, keepUnitiesInShape); } - TadPack& ConstantTadHelper::tadForDimensions(Nd4jLong *originalShape, const std::vector &dimensions, const bool keepUnitiesInShape) { + TadPack& ConstantTadHelper::tadForDimensions(const Nd4jLong *originalShape, const std::vector &dimensions, const bool keepUnitiesInShape) { return tadForDimensions(originalShape, const_cast(dimensions.data()), dimensions.size(), keepUnitiesInShape); } - TadPack& ConstantTadHelper::tadForDimensions(Nd4jLong *originalShape, int* dimensions, int dimLength, const bool keepUnitiesInShape) { + TadPack& ConstantTadHelper::tadForDimensions(const Nd4jLong *originalShape, int* dimensions, int dimLength, const bool keepUnitiesInShape) { TadDescriptor 
tadDescriptor(originalShape, dimensions, dimLength, keepUnitiesInShape); return tadForDimensions(tadDescriptor); } diff --git a/libnd4j/include/helpers/cuda/ConstantTadHelper.cu b/libnd4j/include/helpers/cuda/ConstantTadHelper.cu index 0a4bc6115..b9ff8887f 100644 --- a/libnd4j/include/helpers/cuda/ConstantTadHelper.cu +++ b/libnd4j/include/helpers/cuda/ConstantTadHelper.cu @@ -42,15 +42,15 @@ namespace nd4j { return _INSTANCE; } - TadPack& ConstantTadHelper::tadForDimensions(Nd4jLong *originalShape, int dimension, const bool keepUnitiesInShape) { + TadPack& ConstantTadHelper::tadForDimensions(const Nd4jLong *originalShape, int dimension, const bool keepUnitiesInShape) { return tadForDimensions(originalShape, &dimension, 1, keepUnitiesInShape); } - TadPack& ConstantTadHelper::tadForDimensions(Nd4jLong *originalShape, const std::vector &dimensions, const bool keepUnitiesInShape) { + TadPack& ConstantTadHelper::tadForDimensions(const Nd4jLong *originalShape, const std::vector &dimensions, const bool keepUnitiesInShape) { return tadForDimensions(originalShape, const_cast(dimensions.data()), dimensions.size(), keepUnitiesInShape); } - TadPack& ConstantTadHelper::tadForDimensions(Nd4jLong *originalShape, int* dimensions, int dimLength, const bool keepUnitiesInShape) { + TadPack& ConstantTadHelper::tadForDimensions(const Nd4jLong *originalShape, int* dimensions, int dimLength, const bool keepUnitiesInShape) { TadDescriptor tadDescriptor(originalShape, dimensions, dimLength, keepUnitiesInShape); return tadForDimensions(tadDescriptor); } diff --git a/libnd4j/include/helpers/impl/GradCheck.cpp b/libnd4j/include/helpers/impl/GradCheck.cpp index 2824fb1f8..8beaaaa12 100644 --- a/libnd4j/include/helpers/impl/GradCheck.cpp +++ b/libnd4j/include/helpers/impl/GradCheck.cpp @@ -58,7 +58,7 @@ bool GradCheck::checkGrad(ops::DeclarableOp& opFF, ops::DeclarableOp& opBP, cons const std::vector& inArrsFF = argsHolderFF.getInArrs(); const std::vector& inArrsBP = argsHolderBP.getInArrs(); - // fill input gradient arrays in accordance to type of loss function + // fill input gradient arrays in accordance to kind of loss function fillGradArrays(loss, std::vector(&inArrsBP[numInArrsFF], &inArrsBP[numInArrsFF + numInGradArrsBP])); // beck prop pass diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index a45f7d1d0..b9b519510 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -987,9 +987,10 @@ namespace shape { // dimsToExclude - should be sorted in increasing order ND4J_EXPORT _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr); - // calculate offsets of max-array, these output offsets correspond to one minIdx index of min-array which is sub-array of max-array + // calculate offsets of max-array, these offsets correspond to one minIdx index of min-array which is sub-array of max-array + // maxOffsets - will contain calculated offsets of max-array, buffer for maxOffsets should be allocated beforehand // dimsToExclude - should be sorted in increasing order - // memBuff - auxiliary memory buffer (size = 2 * max_rank) for coordinates and increments storing, should be passed from outside + // memBuff - auxiliary memory buffer (size = 2 * max_rank) for coordinates and increments storing, should be allocated beforehand ND4J_EXPORT _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const 
Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude = nullptr); // calculates offsets for entities (elements or sub-arrays), shape in context of sub-array means dimensions excluded from outer array diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp index 7157ad709..16062769a 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_hue.cpp @@ -15,7 +15,8 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -28,46 +29,35 @@ namespace nd4j { namespace ops { - DECLARE_TYPES(adjust_hue) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setSameMode(true); - } - CONFIGURABLE_OP_IMPL(adjust_hue, 1, 1, true, -2, -2) { - auto input = INPUT_VARIABLE(0); - auto output = OUTPUT_VARIABLE(0); +CONFIGURABLE_OP_IMPL(adjust_hue, 1, 1, true, 1, -2) { - REQUIRE_TRUE(input->rankOf() == 3 || input->rankOf() == 4, 0, "AdjustHue: op expects either 3D or 4D input, but got %i instead", input->rankOf()); + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); + + const int rank = input->rankOf(); + const int dimC = block.getIArguments()->size() > 0 ? (INT_ARG(0) >= 0 ? INT_ARG(0) : INT_ARG(0) + rank) : rank - 1; + const double delta = T_ARG(0); + + REQUIRE_TRUE(rank >= 3, 0, "ADJUST_HUE: op expects rank of input array to be >= 3, but got %i instead", rank); + REQUIRE_TRUE(input->sizeAt(dimC) == 3, 0, "ADJUST_HUE: operation expects image with 3 channels (R, G, B), but got %i instead", input->sizeAt(dimC)); + REQUIRE_TRUE(-1. <= delta && delta <= 1., 0, "ADJUST_HUE: parameter delta must be within [-1, 1] interval, but got %f instead", delta); + + NDArray deltaScalarArr = NDArrayFactory::create(delta, block.launchContext()); + + helpers::adjustHue(block.launchContext(), input, &deltaScalarArr, output, dimC); + + return Status::OK(); +} + +DECLARE_TYPES(adjust_hue) { + getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY) + ->setSameMode(true); +} - double delta = 0; - if (block.numT() > 0) - delta = T_ARG(0); - else if (block.width() > 1) { - auto _d = INPUT_VARIABLE(1); - if (!_d->isScalar()) { - auto str = ShapeUtils::shapeAsString(_d); - REQUIRE_TRUE(_d->isScalar(), 0, "AdjustHue: delta should be scalar NDArray, but got %s instead", str.c_str()); - } - delta = _d->e(0); - } - bool isNHWC = false; - if (block.numI() > 0) - isNHWC = INT_ARG(0) == 1; - - int numChannels = isNHWC ? 
input->sizeAt(-1) : input->sizeAt(-3); - - REQUIRE_TRUE(numChannels == 3, 0, "AdjustHue: this operation expects image with 3 channels (R, G, B), but got % instead", numChannels); - - auto ts = NDArrayFactory::create(delta, block.launchContext()); - // FIXME: delta should be NDArray scalar - helpers::_adjust_hue(block.launchContext(), input, output, &ts, isNHWC); - - return Status::OK(); - } } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp index 6eab17c2d..b4472bef5 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/adjust_saturation.cpp @@ -27,45 +27,33 @@ namespace nd4j { namespace ops { - DECLARE_TYPES(adjust_saturation) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setSameMode(true); - } - CONFIGURABLE_OP_IMPL(adjust_saturation, 1, 1, true, -2, -2) { - auto input = INPUT_VARIABLE(0); - auto output = OUTPUT_VARIABLE(0); +CONFIGURABLE_OP_IMPL(adjust_saturation, 1, 1, true, 1, -2) { - REQUIRE_TRUE(input->rankOf() == 3 || input->rankOf() == 4, 0, "AdjustSaturation: op expects either 3D or 4D input, but got %i instead", input->rankOf()); + auto input = INPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); - double delta = 0; - if (block.numT() > 0) - delta = T_ARG(0); - else if (block.width() > 1) { - auto _d = INPUT_VARIABLE(1); - if (!_d->isScalar()) { - auto str = ShapeUtils::shapeAsString(_d); - REQUIRE_TRUE(_d->isScalar(), 0, "AdjustSaturation: delta should be scalar NDArray, but got %s instead", str.c_str()); - } + const int rank = input->rankOf(); + const int dimC = block.getIArguments()->size() > 0 ? (INT_ARG(0) >= 0 ? INT_ARG(0) : INT_ARG(0) + rank) : rank - 1; + const double factor = T_ARG(0); - delta = _d->e(0); - } + REQUIRE_TRUE(rank >= 3, 0, "ADJUST_SATURATION: op expects rank of input array to be >= 3, but got %i instead", rank); + REQUIRE_TRUE(input->sizeAt(dimC) == 3, 0, "ADJUST_SATURATION: operation expects image with 3 channels (R, G, B), but got %i instead", input->sizeAt(dimC)); - bool isNHWC = false; - if (block.numI() > 0) - isNHWC = INT_ARG(0) == 1; + NDArray factorScalarArr = NDArrayFactory::create(factor, block.launchContext()); - int numChannels = isNHWC ? input->sizeAt(-1) : input->sizeAt(-3); + helpers::adjustSaturation(block.launchContext(), input, &factorScalarArr, output, dimC); + + return Status::OK(); +} + +DECLARE_TYPES(adjust_saturation) { + getOpDescriptor()->setAllowedInputTypes(nd4j::DataType::ANY) + ->setSameMode(true); +} - REQUIRE_TRUE(numChannels == 3, 0, "AdjustSaturation: this operation expects image with 3 channels (R, G, B), but got % instead", numChannels); - auto ts = NDArrayFactory::create(delta, block.launchContext()); - // FIXME: delta should be NDArray scalar - helpers::adjust_saturation(block.launchContext(), input, output, &ts, isNHWC); - return Status::OK(); - } } } diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp index 9dbb68aed..c28038221 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/scatter_add.cpp @@ -16,7 +16,7 @@ // // @author Created by raver119 on 24.11.17. 
-// @author Yurii Shyrma (iuriish@yahoo.com) +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -26,64 +26,67 @@ #include namespace nd4j { - namespace ops { - OP_IMPL(scatter_add, 3, 1, true) { - auto input = INPUT_VARIABLE(0); - auto indices = INPUT_VARIABLE(1); - auto updates = INPUT_VARIABLE(2); +namespace ops { - auto output = OUTPUT_VARIABLE(0); +OP_IMPL(scatter_add, 3, 1, true) { + auto input = INPUT_VARIABLE(0); + auto indices = INPUT_VARIABLE(1); + auto updates = INPUT_VARIABLE(2); - const bool lock = block.getBArguments()->empty() ? false : B_ARG(0); + auto output = OUTPUT_VARIABLE(0); - const int inRank = input->rankOf(); - const int indRank = indices->rankOf(); - const int updRank = updates->rankOf(); - const Nd4jLong indLen = indices->lengthOf(); - - REQUIRE_TRUE(inRank > 0, 0, "SCATTER_ADD OP: input should not be scalar !"); - - if(inRank == 1) { - REQUIRE_TRUE(indices->isSameShape(updates), 0, "SCATTER_ADD OP: when input array has rank = 1 then indices and updates must have the same shapes, but got %s and %s correspondingly !", ShapeUtils::shapeAsString(indices).c_str(), ShapeUtils::shapeAsString(updates).c_str()); - } - else if (inRank == updRank && indices->isVector()) { + const bool lock = block.getBArguments()->empty() ? false : B_ARG(0); - std::vector updShape = updates->getShapeAsVector(); - std::vector inShape = input->getShapeAsVector(); - std::vector expectedUpdShape = {indices->lengthOf()}; - expectedUpdShape.insert(expectedUpdShape.end(), inShape.begin()+1, inShape.end()); - - REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ADD OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str()); - } - else { + const int inRank = input->rankOf(); + const int indRank = indices->rankOf(); + const int updRank = updates->rankOf(); + const Nd4jLong indLen = indices->lengthOf(); - REQUIRE_TRUE(updRank == indRank + inRank - 1, 0, "SCATTER_ADD OP: wrong rank of updates array, expected is %i, but got %i instead !", indRank + inRank - 1 , updRank); - - std::vector updShape = updates->getShapeAsVector(); - std::vector inShape = input->getShapeAsVector(); - std::vector expectedUpdShape = indices->getShapeAsVector(); - expectedUpdShape.insert(expectedUpdShape.end(), inShape.begin() + Nd4jLong(1L), inShape.end()); + REQUIRE_TRUE(inRank > 0, 0, "SCATTER_ADD OP: input should not be scalar !"); - REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ADD OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str()); - } + if(inRank == 1) { + REQUIRE_TRUE(indices->isSameShape(updates), 0, "SCATTER_ADD OP: when input array has rank = 1 then indices and updates must have the same shapes, but got %s and %s correspondingly !", ShapeUtils::shapeAsString(indices).c_str(), ShapeUtils::shapeAsString(updates).c_str()); + } + else if (inRank == updRank && indices->isVector()) { - if (!block.isInplace()) - output->assign(input); + std::vector updShape = updates->getShapeAsVector(); + std::vector inShape = input->getShapeAsVector(); + std::vector expectedUpdShape = {indices->lengthOf()}; + expectedUpdShape.insert(expectedUpdShape.end(), inShape.begin()+1, inShape.end()); - helpers::scatter(block.launchContext(), pairwise::Add, *indices, *updates, *output, lock); + REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ADD OP: wrong shape of updates array, expected is %s, 
but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str()); + } + else { - return Status::OK(); - } - DECLARE_SYN(ScatterAdd, scatter_add); + REQUIRE_TRUE(updRank == indRank + inRank - 1, 0, "SCATTER_ADD OP: wrong rank of updates array, expected is %i, but got %i instead !", indRank + inRank - 1 , updRank); + + std::vector updShape = updates->getShapeAsVector(); + std::vector inShape = input->getShapeAsVector(); + std::vector expectedUpdShape = indices->getShapeAsVector(); + expectedUpdShape.insert(expectedUpdShape.end(), inShape.begin() + Nd4jLong(1L), inShape.end()); + + REQUIRE_TRUE(expectedUpdShape == updShape, 0, "SCATTER_ADD OP: wrong shape of updates array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expectedUpdShape).c_str(), ShapeUtils::shapeAsString(updShape).c_str()); } - DECLARE_TYPES(scatter_add) { - getOpDescriptor() - ->setAllowedInputTypes(0, {ALL_INTS, ALL_FLOATS}) - ->setAllowedInputTypes(1, {ALL_INTS}) - ->setAllowedInputTypes(2, {ALL_INTS, ALL_FLOATS}) - ->setAllowedOutputTypes({ALL_INTS, ALL_FLOATS}); - } + if (!block.isInplace()) + output->assign(input); + + helpers::scatter(block.launchContext(), pairwise::Add, *indices, *updates, *output, lock); + + return Status::OK(); +} + +DECLARE_SYN(ScatterAdd, scatter_add); + +DECLARE_TYPES(scatter_add) { + getOpDescriptor() + ->setAllowedInputTypes(0, {ALL_INTS, ALL_FLOATS}) + ->setAllowedInputTypes(1, {ALL_INTS}) + ->setAllowedInputTypes(2, {ALL_INTS, ALL_FLOATS}) + ->setAllowedOutputTypes({ALL_INTS, ALL_FLOATS}); +} + +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/sequence_mask.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/sequence_mask.cpp index 26000d039..e7694b409 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/sequence_mask.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/sequence_mask.cpp @@ -57,16 +57,26 @@ namespace nd4j { auto in = inputShape->at(0); int outRank = shape::rank(in) + 1; auto input = INPUT_VARIABLE(0); + auto dtype = DataType::BOOL; Nd4jLong maxInd = input->argMax(); - float max = input->e(maxInd); + Nd4jLong max = input->e(maxInd); + if (block.getIArguments()->size() > 0) { + if (block.width() < 2) { maxInd = INT_ARG(0); if (maxInd < max) maxInd = static_cast(max); + if (block.getIArguments()->size() > 1) + dtype = (DataType)INT_ARG(1); + } + else { + dtype = (DataType)INT_ARG(0); + } } - else if (block.width() > 1) { + + if (block.width() > 1) { auto maxlen = INPUT_VARIABLE(1); - float tmaxlen = maxlen->e(0); + Nd4jLong tmaxlen = maxlen->e(0); if (tmaxlen > max) maxInd = static_cast(tmaxlen); } @@ -80,14 +90,14 @@ namespace nd4j { outShapeInfo[i + 1] = shape::sizeAt(in, i); outShapeInfo[outRank] = lastDimension; - ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in)); + ShapeUtils::updateStridesAndType(outShapeInfo, dtype, shape::order(in)); return SHAPELIST(CONSTANT(outShapeInfo)); } DECLARE_TYPES(sequence_mask) { getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedInputTypes({ALL_INTS}) ->setAllowedOutputTypes(nd4j::DataType::ANY); } } diff --git a/libnd4j/include/ops/declarable/generic/random/random_shuffle.cpp b/libnd4j/include/ops/declarable/generic/random/random_shuffle.cpp index 1c69dd38b..e9eb73dbf 100644 --- a/libnd4j/include/ops/declarable/generic/random/random_shuffle.cpp +++ b/libnd4j/include/ops/declarable/generic/random/random_shuffle.cpp @@ -33,11 +33,11 @@ 
OP_IMPL(random_shuffle, 1, 1, true) { const bool isInplace = block.isInplace(); auto output = isInplace ? nullptr : OUTPUT_VARIABLE(0); - nd4j::random::RandomBuffer* rng = block.getRNG(); - - REQUIRE_TRUE(rng != nullptr, 0, "RANDOM_SHUFFLE op: RNG should be defined in Graph !"); +// nd4j::random::RandomBuffer* rng = block.getRNG(); + nd4j::graph::RandomGenerator rng = block.randomGenerator(); +// REQUIRE_TRUE(rng != nullptr, 0, "RANDOM_SHUFFLE op: RNG should be defined in Graph !"); - helpers::randomShuffle(block.launchContext(), *input, *output, *rng, isInplace); + helpers::randomShuffle(block.launchContext(), *input, *output, rng, isInplace); return Status::OK(); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/gruCell.cpp b/libnd4j/include/ops/declarable/generic/recurrent/gruCell.cpp index 32642b37b..5ddd1654e 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/gruCell.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/gruCell.cpp @@ -31,17 +31,18 @@ namespace ops { ////////////////////////////////////////////////////////////////////////// CUSTOM_OP_IMPL(gruCell, 6, 4, false, 0, 0) { - auto x = INPUT_VARIABLE(0); // input [bS, nIn], nIn - input size - auto hLast = INPUT_VARIABLE(1); // previous cell output [bS, nU], that is at previous time step t-1, nU - number of units - auto Wru = INPUT_VARIABLE(2); // RU weights - [nIn+nU, 2*nU] - reset and update gates (input/recurrent weights) - auto Wc = INPUT_VARIABLE(3); // C weights - [nIn+nU, nU] - cell gate (input/recurrent weights) - auto bru = INPUT_VARIABLE(4); // reset and update biases, [2*nU] - reset and update gates - auto bc = INPUT_VARIABLE(5); // cell biases, [nU] - auto r = OUTPUT_VARIABLE(0); // Reset gate output [bS, nU] - auto u = OUTPUT_VARIABLE(1); // Update gate output [bS, nU] - auto c = OUTPUT_VARIABLE(2); // Cell gate output [bS, nU] - auto h = OUTPUT_VARIABLE(3); // current cell output [bS, nU] + auto x = INPUT_VARIABLE(0); // input [bS, nIn], nIn - input size + auto hLast = INPUT_VARIABLE(1); // previous cell output [bS, nU], that is at previous time step t-1, nU - number of units + auto Wru = INPUT_VARIABLE(2); // RU weights - [nIn+nU, 2*nU] - reset and update gates (input/recurrent weights) + auto Wc = INPUT_VARIABLE(3); // C weights - [nIn+nU, nU] - cell gate (input/recurrent weights) + auto bru = INPUT_VARIABLE(4); // reset and update biases, [2*nU] - reset and update gates + auto bc = INPUT_VARIABLE(5); // cell biases, [nU] + + auto r = OUTPUT_VARIABLE(0); // Reset gate output [bS, nU] + auto u = OUTPUT_VARIABLE(1); // Update gate output [bS, nU] + auto c = OUTPUT_VARIABLE(2); // Cell gate output [bS, nU] + auto h = OUTPUT_VARIABLE(3); // current cell output [bS, nU] REQUIRE_TRUE(x->rankOf()==2 && hLast->rankOf()==2, 0, "gruCell: Input ranks must be 2 for inputs 0 and 1 (x, hLast) - got %i, %i", x->rankOf(), hLast->rankOf()); @@ -118,65 +119,58 @@ DECLARE_SHAPE_FN(gruCell) { ////////////////////////////////////////////////////////////////////////// -CUSTOM_OP_IMPL(gruCell_bp, 6, 5, false, 0, 0) { +CUSTOM_OP_IMPL(gruCell_bp, 10, 6, false, 0, 0) { - auto x = INPUT_VARIABLE(0); // input [bS x iS] - auto hi = INPUT_VARIABLE(1); // previous cell output [bS x nU] - auto Wx = INPUT_VARIABLE(2); // input-to-hidden weights, [iS x 3*nU] - auto Wh = INPUT_VARIABLE(3); // hidden-to-hidden weights, [nU x 3*nU] - auto b = INPUT_VARIABLE(4); // biases, [3*nU] - auto dLdh = INPUT_VARIABLE(5); // gradient wrt output, [bS,nU], that is epsilon_next - auto dLdWxi = block.width() > 6 ? 
INPUT_VARIABLE(6) : nullptr; // gradient wrt Wx at previous time step, [iS, 3*nU] - auto dLdWhi = block.width() > 7 ? INPUT_VARIABLE(7) : nullptr; // gradient wrt Wh at previous time step, [nU, 3*nU] - auto dLdbi = block.width() > 8 ? INPUT_VARIABLE(8) : nullptr; // gradient wrt b at previous time step, [3*nU] + auto x = INPUT_VARIABLE(0); // input [bS x iS] + auto hi = INPUT_VARIABLE(1); // previous cell output [bS x nU] + auto W = INPUT_VARIABLE(2); // weights, [iS+nU x 2*nU] + auto Wc = INPUT_VARIABLE(3); // c weights, [iS+nU x nU] + auto b = INPUT_VARIABLE(4); // biases, [2*nU] + auto bc = INPUT_VARIABLE(5); // biases, [nU] + auto dLdr = INPUT_VARIABLE(6); // gradient wrt reset gate, [bS, nU] + auto dLdu = INPUT_VARIABLE(7); // gradient wrt update gate, [bS, nU] + auto dLdc = INPUT_VARIABLE(8); // gradient wrt cell state, [bS, nU] + auto dLdh = INPUT_VARIABLE(9); // gradient wrt current cell output, [bS, nU] - auto dLdx = OUTPUT_VARIABLE(0); // gradient wrt x, [bS, iS], that is epsilon - auto dLdhi = OUTPUT_VARIABLE(1); // gradient wrt hi, [bS, nU] - auto dLdWx = OUTPUT_VARIABLE(2); // gradient wrt Wx, [iS, 3*nU] - auto dLdWh = OUTPUT_VARIABLE(3); // gradient wrt Wh, [nU, 3*nU] - auto dLdb = OUTPUT_VARIABLE(4); // gradient wrt biases, [3*nU] + auto dLdx = OUTPUT_VARIABLE(0); // gradient wrt x, [bS, iS] + auto dLdhi = OUTPUT_VARIABLE(1); // gradient wrt hi, [bS, nU] + auto dLdW = OUTPUT_VARIABLE(2); // gradient wrt W, [iS+nU x 2*nU] + auto dLdWc = OUTPUT_VARIABLE(3); // gradient wrt Wc, [iS+nU x nU] + auto dLdb = OUTPUT_VARIABLE(4); // gradient wrt biases, [2*nU] + auto dLdbc = OUTPUT_VARIABLE(5); // gradient wrt c biases, [nU] - const int rank = x->rankOf(); // = 2 - const Nd4jLong bS = x->sizeAt(0); - const Nd4jLong iS = x->sizeAt(1); - const Nd4jLong nU = hi->sizeAt(1); + const Nd4jLong bS = x->sizeAt(0); + const Nd4jLong iS = x->sizeAt(1); + const Nd4jLong nU = hi->sizeAt(1); - const std::string hiShape = ShapeUtils::shapeAsString(hi); - const std::string hiCorrectShape = ShapeUtils::shapeAsString({bS, nU}); - const std::string wxShape = ShapeUtils::shapeAsString(Wx); - const std::string wxCorrectShape = ShapeUtils::shapeAsString({iS, 3*nU}); - const std::string whShape = ShapeUtils::shapeAsString(Wh); - const std::string whCorrectShape = ShapeUtils::shapeAsString({nU, 3*nU}); - const std::string bShape = ShapeUtils::shapeAsString(b); - const std::string bCorrectShape = ShapeUtils::shapeAsString({3*nU}); - const std::string dLdhShape = ShapeUtils::shapeAsString(dLdh); - const std::string dLdhCorrectShape = ShapeUtils::shapeAsString({bS, nU}); + REQUIRE_TRUE(x->rankOf() == 2, 0, "GRU_CELL_BP: rank of input array x must be 2, but got %i instead", x->rankOf()); - REQUIRE_TRUE(hiShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of previous cell output array, expected is %s, but got %s instead !", hiCorrectShape.c_str(), hiShape.c_str()); - REQUIRE_TRUE(wxShape == wxCorrectShape, 0, "GRU_CELL_BP op: wrong shape of input-to-hidden weights array, expected is %s, but got %s instead !", wxCorrectShape.c_str(), wxShape.c_str()); - REQUIRE_TRUE(whShape == whCorrectShape, 0, "GRU_CELL_BP op: wrong shape of hidden-to-hidden weights array, expected is %s, but got %s instead !", whCorrectShape.c_str(), whShape.c_str()); - REQUIRE_TRUE(bShape == bCorrectShape, 0, "GRU_CELL_BP op: wrong shape of biases array, expected is %s, but got %s instead !", bCorrectShape.c_str(), bShape.c_str()); - REQUIRE_TRUE(dLdhShape == dLdhCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdh array (epsilon_next), 
expected is %s, but got %s instead !", dLdhCorrectShape.c_str(), dLdhShape.c_str()); + const std::string hiShape = ShapeUtils::shapeAsString(hi); + const std::string hiCorrectShape = ShapeUtils::shapeAsString({bS, nU}); + const std::string wShape = ShapeUtils::shapeAsString(W); + const std::string wCorrectShape = ShapeUtils::shapeAsString({iS+nU, 2*nU}); + const std::string wcShape = ShapeUtils::shapeAsString(Wc); + const std::string wcCorrectShape = ShapeUtils::shapeAsString({iS+nU, nU}); + const std::string bShape = ShapeUtils::shapeAsString(b); + const std::string bCorrectShape = ShapeUtils::shapeAsString({2*nU}); + const std::string bcShape = ShapeUtils::shapeAsString(bc); + const std::string bcCorrectShape = ShapeUtils::shapeAsString({nU}); + const std::string dLdrShape = ShapeUtils::shapeAsString(dLdr); + const std::string dLduShape = ShapeUtils::shapeAsString(dLdu); + const std::string dLdcShape = ShapeUtils::shapeAsString(dLdc); + const std::string dLdhShape = ShapeUtils::shapeAsString(dLdh); - if(dLdWxi != nullptr) { - const std::string dLdWxiShape = ShapeUtils::shapeAsString(dLdWxi); - const std::string dLdWxiCorrectShape = ShapeUtils::shapeAsString({iS, 3*nU}); - REQUIRE_TRUE(dLdWxiShape == dLdWxiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdWxi array (gradient wrt Wx at previous time step), expected is %s, but got %s instead !", dLdWxiCorrectShape.c_str(), dLdWxiShape.c_str()); - } + REQUIRE_TRUE(hiShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of previous cell output array, expected is %s, but got %s instead !", hiCorrectShape.c_str(), hiShape.c_str()); + REQUIRE_TRUE(wShape == wCorrectShape, 0, "GRU_CELL_BP op: wrong shape of weights array, expected is %s, but got %s instead !", wCorrectShape.c_str(), wShape.c_str()); + REQUIRE_TRUE(wcShape == wcCorrectShape, 0, "GRU_CELL_BP op: wrong shape of c weights array, expected is %s, but got %s instead !", wcCorrectShape.c_str(), wcShape.c_str()); + REQUIRE_TRUE(bShape == bCorrectShape, 0, "GRU_CELL_BP op: wrong shape of biases array, expected is %s, but got %s instead !", bCorrectShape.c_str(), bShape.c_str()); + REQUIRE_TRUE(bcShape == bcCorrectShape, 0, "GRU_CELL_BP op: wrong shape of c biases array, expected is %s, but got %s instead !", bcCorrectShape.c_str(), bcShape.c_str()); + REQUIRE_TRUE(dLdrShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdr array (gradient wrt reset gate), expected is %s, but got %s instead !", hiCorrectShape.c_str(), dLdrShape.c_str()); + REQUIRE_TRUE(dLduShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdu array (gradient wrt update gate), expected is %s, but got %s instead !", hiCorrectShape.c_str(), dLduShape.c_str()); + REQUIRE_TRUE(dLdcShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdc array (gradient wrt cell state), expected is %s, but got %s instead !", hiCorrectShape.c_str(), dLdcShape.c_str()); + REQUIRE_TRUE(dLdhShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdh array (gradient wrt current cell output), expected is %s, but got %s instead !", hiCorrectShape.c_str(), dLdhShape.c_str()); - if(dLdWhi != nullptr) { - const std::string dLdWhiShape = ShapeUtils::shapeAsString(dLdWhi); - const std::string dLdWhiCorrectShape = ShapeUtils::shapeAsString({nU, 3*nU}); - REQUIRE_TRUE(dLdWhiShape == dLdWhiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdWhi array (gradient wrt Wh at previous time step), expected is %s, but got %s instead !", dLdWhiCorrectShape.c_str(), dLdWhiShape.c_str()); - } - - if(dLdbi != nullptr) { - const 
std::string dLdbiShape = ShapeUtils::shapeAsString(dLdbi); - const std::string dLdbiCorrectShape = ShapeUtils::shapeAsString({3*nU}); - REQUIRE_TRUE(dLdbiShape == dLdbiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdbi array (gradient wrt biases at previous time step), expected is %s, but got %s instead !", dLdbiCorrectShape.c_str(), dLdbiShape.c_str()); - } - - helpers::gruCellBP(block.launchContext(), x, hi, Wx, Wh, b, dLdh, dLdWxi, dLdWhi, dLdbi, dLdx, dLdhi, dLdWx, dLdWh, dLdb); + helpers::gruCellBP(block.launchContext(), x, hi, W, Wc, b, bc, dLdr, dLdu, dLdc, dLdh, dLdx, dLdhi, dLdW, dLdWc, dLdb, dLdbc); return Status::OK(); } @@ -192,60 +186,54 @@ DECLARE_TYPES(gruCell_bp) { ->setAllowedInputTypes(6, {ALL_FLOATS}) ->setAllowedInputTypes(7, {ALL_FLOATS}) ->setAllowedInputTypes(8, {ALL_FLOATS}) + ->setAllowedInputTypes(9, {ALL_FLOATS}) ->setAllowedOutputTypes({ALL_FLOATS}); } DECLARE_SHAPE_FN(gruCell_bp) { - auto xShapeInfo = inputShape->at(0); // [bS x iS] - auto hiShapeInfo = inputShape->at(1); // [bS x nU] - auto wxShapeInfo = inputShape->at(2); // [iS x 3*nU] - auto whShapeInfo = inputShape->at(3); // [nU x 3*nU] - auto bShapeInfo = inputShape->at(4); // [3*nU] - auto dLdhShapeInfo = inputShape->at(5); // [bS x nU] + auto xShapeInfo = inputShape->at(0); // [bS x iS] + auto hiShapeInfo = inputShape->at(1); // [bS x nU] + auto wShapeInfo = inputShape->at(2); // [iS+nU x 2*nU] + auto wcShapeInfo = inputShape->at(3); // [iS+nU x nU] + auto bShapeInfo = inputShape->at(4); // [2*nU] + auto bcShapeInfo = inputShape->at(5); // [nU] + auto dLdrShapeInfo = inputShape->at(6); // [bS, nU] + auto dLduShapeInfo = inputShape->at(7); // [bS, nU] + auto dLdcShapeInfo = inputShape->at(8); // [bS, nU] + auto dLdhShapeInfo = inputShape->at(9); // [bS, nU] const int rank = xShapeInfo[0]; // = 2 const Nd4jLong bS = xShapeInfo[1]; const Nd4jLong iS = xShapeInfo[2]; const Nd4jLong nU = hiShapeInfo[2]; - const std::string hiShape = ShapeUtils::shapeAsString(hiShapeInfo); - const std::string hiCorrectShape = ShapeUtils::shapeAsString({bS, nU}); - const std::string wxShape = ShapeUtils::shapeAsString(wxShapeInfo); - const std::string wxCorrectShape = ShapeUtils::shapeAsString({iS, 3*nU}); - const std::string whShape = ShapeUtils::shapeAsString(whShapeInfo); - const std::string whCorrectShape = ShapeUtils::shapeAsString({nU, 3*nU}); - const std::string bShape = ShapeUtils::shapeAsString(bShapeInfo); - const std::string bCorrectShape = ShapeUtils::shapeAsString({3*nU}); - const std::string dLdhShape = ShapeUtils::shapeAsString(dLdhShapeInfo); - const std::string dLdhCorrectShape = ShapeUtils::shapeAsString({bS, nU}); + REQUIRE_TRUE(xShapeInfo[0] == 2, 0, "GRU_CELL_BP: rank of input array x must be 2, but got %i instead", xShapeInfo[0]); - REQUIRE_TRUE(hiShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of previous cell output array, expected is %s, but got %s instead !", hiCorrectShape.c_str(), hiShape.c_str()); - REQUIRE_TRUE(wxShape == wxCorrectShape, 0, "GRU_CELL_BP op: wrong shape of input-to-hidden weights array, expected is %s, but got %s instead !", wxCorrectShape.c_str(), wxShape.c_str()); - REQUIRE_TRUE(whShape == whCorrectShape, 0, "GRU_CELL_BP op: wrong shape of hidden-to-hidden weights array, expected is %s, but got %s instead !", whCorrectShape.c_str(), whShape.c_str()); - REQUIRE_TRUE(bShape == bCorrectShape, 0, "GRU_CELL_BP op: wrong shape of biases array, expected is %s, but got %s instead !", bCorrectShape.c_str(), bShape.c_str()); - REQUIRE_TRUE(dLdhShape == dLdhCorrectShape, 0, 
"GRU_CELL_BP op: wrong shape of dLdh array (epsilon_next), expected is %s, but got %s instead !", dLdhCorrectShape.c_str(), dLdhShape.c_str()); + const std::string hiShape = ShapeUtils::shapeAsString(hiShapeInfo); + const std::string hiCorrectShape = ShapeUtils::shapeAsString({bS, nU}); + const std::string wShape = ShapeUtils::shapeAsString(wShapeInfo); + const std::string wCorrectShape = ShapeUtils::shapeAsString({iS+nU, 2*nU}); + const std::string wcShape = ShapeUtils::shapeAsString(wcShapeInfo); + const std::string wcCorrectShape = ShapeUtils::shapeAsString({iS+nU, nU}); + const std::string bShape = ShapeUtils::shapeAsString(bShapeInfo); + const std::string bCorrectShape = ShapeUtils::shapeAsString({2*nU}); + const std::string bcShape = ShapeUtils::shapeAsString(bcShapeInfo); + const std::string bcCorrectShape = ShapeUtils::shapeAsString({nU}); + const std::string dLdrShape = ShapeUtils::shapeAsString(dLdrShapeInfo); + const std::string dLduShape = ShapeUtils::shapeAsString(dLduShapeInfo); + const std::string dLdcShape = ShapeUtils::shapeAsString(dLdcShapeInfo); + const std::string dLdhShape = ShapeUtils::shapeAsString(dLdhShapeInfo); - if(block.width() > 6) { - Nd4jLong* dLdWxiShapeInfo = inputShape->at(6); // [iS x 3*nU] - const std::string dLdWxiShape = ShapeUtils::shapeAsString(dLdWxiShapeInfo); - const std::string dLdWxiCorrectShape = ShapeUtils::shapeAsString({iS, 3*nU}); - REQUIRE_TRUE(dLdWxiShape == dLdWxiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdWxi array (gradient wrt Wx at previous time step), expected is %s, but got %s instead !", dLdWxiCorrectShape.c_str(), dLdWxiShape.c_str()); - } - - if(block.width() > 7) { - Nd4jLong* dLdWhiShapeInfo = inputShape->at(7); // [nU x 3*nU] - const std::string dLdWhiShape = ShapeUtils::shapeAsString(dLdWhiShapeInfo); - const std::string dLdWhiCorrectShape = ShapeUtils::shapeAsString({nU, 3*nU}); - REQUIRE_TRUE(dLdWhiShape == dLdWhiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdWhi array (gradient wrt Wh at previous time step), expected is %s, but got %s instead !", dLdWhiCorrectShape.c_str(), dLdWhiShape.c_str()); - } - - if(block.width() > 8) { - Nd4jLong* dLdbiShapeInfo = inputShape->at(8); // [3*nU] - const std::string dLdbiShape = ShapeUtils::shapeAsString(dLdbiShapeInfo); - const std::string dLdbiCorrectShape = ShapeUtils::shapeAsString({3*nU}); - REQUIRE_TRUE(dLdbiShape == dLdbiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdbi array (gradient wrt biases at previous time step), expected is %s, but got %s instead !", dLdbiCorrectShape.c_str(), dLdbiShape.c_str()); - } + REQUIRE_TRUE(hiShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of previous cell output array, expected is %s, but got %s instead !", hiCorrectShape.c_str(), hiShape.c_str()); + REQUIRE_TRUE(wShape == wCorrectShape, 0, "GRU_CELL_BP op: wrong shape of weights array, expected is %s, but got %s instead !", wCorrectShape.c_str(), wShape.c_str()); + REQUIRE_TRUE(wcShape == wcCorrectShape, 0, "GRU_CELL_BP op: wrong shape of c weights array, expected is %s, but got %s instead !", wcCorrectShape.c_str(), wcShape.c_str()); + REQUIRE_TRUE(bShape == bCorrectShape, 0, "GRU_CELL_BP op: wrong shape of biases array, expected is %s, but got %s instead !", bCorrectShape.c_str(), bShape.c_str()); + REQUIRE_TRUE(bcShape == bcCorrectShape, 0, "GRU_CELL_BP op: wrong shape of c biases array, expected is %s, but got %s instead !", bcCorrectShape.c_str(), bcShape.c_str()); + REQUIRE_TRUE(dLdrShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdr array 
(gradient wrt reset gate), expected is %s, but got %s instead !", hiCorrectShape.c_str(), dLdrShape.c_str()); + REQUIRE_TRUE(dLduShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdu array (gradient wrt update gate), expected is %s, but got %s instead !", hiCorrectShape.c_str(), dLduShape.c_str()); + REQUIRE_TRUE(dLdcShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdc array (gradient wrt cell state), expected is %s, but got %s instead !", hiCorrectShape.c_str(), dLdcShape.c_str()); + REQUIRE_TRUE(dLdhShape == hiCorrectShape, 0, "GRU_CELL_BP op: wrong shape of dLdh array (gradient wrt current cell output), expected is %s, but got %s instead !", hiCorrectShape.c_str(), dLdhShape.c_str()); Nd4jLong *dLdxShapeInfo = nullptr; COPY_SHAPE(xShapeInfo, dLdxShapeInfo); @@ -253,17 +241,19 @@ DECLARE_SHAPE_FN(gruCell_bp) { Nd4jLong *dLdhiShapeInfo = nullptr; COPY_SHAPE(hiShapeInfo, dLdhiShapeInfo); - Nd4jLong *dLdWxShapeInfo = nullptr; - COPY_SHAPE(wxShapeInfo, dLdWxShapeInfo); + Nd4jLong *dLdWShapeInfo = nullptr; + COPY_SHAPE(wShapeInfo, dLdWShapeInfo); - Nd4jLong *dLdWhShapeInfo = nullptr; - COPY_SHAPE(whShapeInfo, dLdWhShapeInfo); + Nd4jLong *dLdWcShapeInfo = nullptr; + COPY_SHAPE(wcShapeInfo, dLdWcShapeInfo); Nd4jLong *dLdbShapeInfo = nullptr; COPY_SHAPE(bShapeInfo, dLdbShapeInfo); - return SHAPELIST(dLdxShapeInfo, dLdhiShapeInfo, dLdWxShapeInfo, dLdWhShapeInfo, dLdbShapeInfo); + Nd4jLong *dLdbcShapeInfo = nullptr; + COPY_SHAPE(bcShapeInfo, dLdbcShapeInfo); + return SHAPELIST(dLdxShapeInfo, dLdhiShapeInfo, dLdWShapeInfo, dLdWcShapeInfo, dLdbShapeInfo, dLdbcShapeInfo); } diff --git a/libnd4j/include/ops/declarable/generic/recurrent/staticRNN.cpp b/libnd4j/include/ops/declarable/generic/recurrent/staticRNN.cpp index 0ddefbece..7eea9e7f2 100644 --- a/libnd4j/include/ops/declarable/generic/recurrent/staticRNN.cpp +++ b/libnd4j/include/ops/declarable/generic/recurrent/staticRNN.cpp @@ -45,13 +45,13 @@ CUSTOM_OP_IMPL(static_rnn, 4, 2, false, 0, 0) { else if(block.width() == 6) { h0 = INPUT_VARIABLE(4); maxTimeStep = INPUT_VARIABLE(5); - } - + } + auto h = OUTPUT_VARIABLE(0); // cell outputs [time x bS x numUnits] auto hFinal = OUTPUT_VARIABLE(1); // at the end it will store cell final non-zero output [bS x numUnits] REQUIRE_TRUE(x->rankOf() == 3, 0, "STATIC_RNN custom operation: input array x must have rank = 3, but got %i instead !", x->rankOf()); - REQUIRE_TRUE(Wx->rankOf() == 2, 0, "STATIC_RNN custom operation: input-to-hidden weights array must have rank = 2, but got %i instead !", Wx->rankOf()); + REQUIRE_TRUE(Wx->rankOf() == 2, 0, "STATIC_RNN custom operation: input-to-hidden weights array must have rank = 2, but got %i instead !", Wx->rankOf()); const int time = x->sizeAt(0); const int bS = x->sizeAt(1); @@ -67,25 +67,25 @@ CUSTOM_OP_IMPL(static_rnn, 4, 2, false, 0, 0) { helpers::rnnTimeLoop(block.launchContext(), x, Wx, Wh, b, h0, maxTimeStep, h, hFinal); - + return Status::OK(); } - DECLARE_TYPES(static_rnn) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes({ALL_FLOATS}); - } +DECLARE_TYPES(static_rnn) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedOutputTypes({ALL_FLOATS}); +} -DECLARE_SHAPE_FN(static_rnn) { +DECLARE_SHAPE_FN(static_rnn) { auto xShapeInfo = inputShape->at(0); // input [time x bS x inSize] - auto WxShapeInfo = inputShape->at(1); // input-to-hidden weights, [inSize x numUnits] - auto WhShapeInfo = inputShape->at(2); // hidden-to-hidden weights, [numUnits x numUnits] - auto 
bShapeInfo = inputShape->at(3); // biases for, [2*numUnits] + auto WxShapeInfo = inputShape->at(1); // input-to-hidden weights, [inSize x numUnits] + auto WhShapeInfo = inputShape->at(2); // hidden-to-hidden weights, [numUnits x numUnits] + auto bShapeInfo = inputShape->at(3); // biases for, [2*numUnits] - Nd4jLong* h0ShapeInfo = nullptr; // initial cell output (at time step = 0) [bS x numUnits] + Nd4jLong* h0ShapeInfo = nullptr; // initial cell output (at time step = 0) [bS x numUnits] Nd4jLong* maxTimeStepShapeInfo = nullptr; // vector [bS] containing integer values within [0,time), each element of this vector set max time step per each input in batch, this means there are no calculations for time >= maxTimeStep if(block.width() == 5) { @@ -97,10 +97,10 @@ DECLARE_SHAPE_FN(static_rnn) { else if(block.width() == 6) { h0ShapeInfo = inputShape->at(4); maxTimeStepShapeInfo = inputShape->at(5); - } + } REQUIRE_TRUE(xShapeInfo[0] == 3, 0, "STATIC_RNN custom operation: input array x must have rank = 3, but got %i instead !", xShapeInfo[0]); - REQUIRE_TRUE(WxShapeInfo[0] == 2, 0, "STATIC_RNN custom operation: input-to-hidden weights array must have rank = 2, but got %i instead !", WxShapeInfo[0]); + REQUIRE_TRUE(WxShapeInfo[0] == 2, 0, "STATIC_RNN custom operation: input-to-hidden weights array must have rank = 2, but got %i instead !", WxShapeInfo[0]); const int inRank = xShapeInfo[0]; const int time = xShapeInfo[1]; @@ -118,7 +118,7 @@ DECLARE_SHAPE_FN(static_rnn) { Nd4jLong *hShapeInfo(nullptr), *hPrevShapeInfo(nullptr); ALLOCATE(hShapeInfo, block.getWorkspace(), shape::shapeInfoLength(inRank), Nd4jLong); ALLOCATE(hPrevShapeInfo, block.getWorkspace(), shape::shapeInfoLength(inRank-1), Nd4jLong); - + hShapeInfo[0] = inRank; hPrevShapeInfo[0] = inRank-1; hShapeInfo[1] = time; @@ -127,9 +127,9 @@ DECLARE_SHAPE_FN(static_rnn) { ShapeUtils::updateStridesAndType(hShapeInfo, xShapeInfo, shape::order(xShapeInfo)); ShapeUtils::updateStridesAndType(hPrevShapeInfo, xShapeInfo, shape::order(xShapeInfo)); - + return SHAPELIST(CONSTANT(hShapeInfo), CONSTANT(hPrevShapeInfo)); -} +} diff --git a/libnd4j/include/ops/declarable/generic/transforms/scatter_update.cpp b/libnd4j/include/ops/declarable/generic/transforms/scatter_update.cpp index 10db21c2e..247e454bd 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/scatter_update.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/scatter_update.cpp @@ -39,7 +39,7 @@ namespace nd4j { * @tparam T */ CONFIGURABLE_OP_IMPL(scatter_update, 2, 1, true, 0, -1) { - + auto operand = INPUT_VARIABLE(0); auto updates = INPUT_VARIABLE(1); diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h index e9b9d1ff6..a32f7d1c3 100644 --- a/libnd4j/include/ops/declarable/headers/parity_ops.h +++ b/libnd4j/include/ops/declarable/headers/parity_ops.h @@ -30,7 +30,7 @@ namespace nd4j { * Expected input: * 0: N-dimensional array * 1: optional axis vector - * + * * Int args: * 0: optional axis */ @@ -43,7 +43,7 @@ namespace nd4j { * Expected input: * 0: N-dimensional array * 1: optional axis vector - * + * * Int args: * 0: optional axis */ @@ -52,21 +52,21 @@ namespace nd4j { #endif /** - * This operation provides various normalization modes: + * This operation provides various normalization modes: * 0: frobenius * 1: euclidean (norm2) * 2: norm1 * 3: norm2 * 4: inf-norm * 5: p-norm - * + * * Expected arguments: * input: N-dimensional array - * - * + * + * * Int args: * 0...: axis - * + * * T args: * 0: norm 
mode * 1: p for p-norm @@ -159,7 +159,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing indexes for first dimension of input - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_add) DECLARE_OP(scatter_add, 3, 1, true); @@ -170,7 +170,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing indexes for first dimension of input - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_sub) DECLARE_OP(scatter_sub, 3, 1, true); @@ -181,7 +181,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing indexes for first dimension of input - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_mul) DECLARE_OP(scatter_mul, 3, 1, true); @@ -192,7 +192,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing indexes for first dimension of input - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_div) DECLARE_OP(scatter_div, 3, 1, true); @@ -203,7 +203,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing indexes for first dimension of input - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_upd) DECLARE_OP(scatter_upd, 3, 1, true); @@ -214,7 +214,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing indexes for first dimension of input - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_max) DECLARE_OP(scatter_max, 3, 1, true); @@ -225,7 +225,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing indexes for first dimension of input - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_min) DECLARE_OP(scatter_min, 3, 1, true); @@ -233,7 +233,7 @@ namespace nd4j { /** * This operation scatter "updates" elements into new output array according to given "indices" - * Expected arguments: + * Expected arguments: * indices: array containing elements/slices indexes of output array to put "updates" elements into, the rest output elements will be zeros * updates: array containing elements to be inserted into output array * shape: contains shape of output array @@ -244,8 +244,8 @@ namespace nd4j { /** * This operation scatter "updates" elements into input array along given "indices" - * Expected arguments: - * input: array to be updated + * Expected arguments: + * input: array to be updated * indices: array containing elements/slices indexes of input array to put "updates" elements into * updates: array containing elements to be inserted into input array */ @@ -258,7 +258,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing elements/slices indexes of input array to add "updates" 
elements to - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_add) DECLARE_OP(scatter_nd_add, 3, 1, true); @@ -269,7 +269,7 @@ namespace nd4j { * Expected arguments: * input: array to be updated * indices: array containing elements/slices indexes of input array to subtract "updates" elements from - * updates: array containing elements to be interfered with input + * updates: array containing elements to be interfered with input */ #if NOT_EXCLUDED(OP_scatter_sub) DECLARE_OP(scatter_nd_sub, 3, 1, true); @@ -279,7 +279,7 @@ namespace nd4j { * This operation takes input's shape, and returns new NDArray filled with specified value * Expected arguments: * input: N-dimensional array - * + * * T args: * 0: scalar value, used to fill NDArray */ @@ -305,7 +305,7 @@ namespace nd4j { /** * This operation returns 3 1D arrays for given 1D array with unique element count and indexes - * input: + * input: * 0 - 1D array * * output: @@ -321,7 +321,7 @@ namespace nd4j { * This operation splits input NDArray into multiple TADs along given dimensions * Expected arguments: * input: N-dimensional array - * + * * Int args: * 0..: TAD axis */ @@ -338,7 +338,7 @@ namespace nd4j { #endif /** - * This operation extracts a strided (optionally) slice from a tensor, + * This operation extracts a strided (optionally) slice from a tensor, */ #if NOT_EXCLUDED(OP_strided_slice) DECLARE_CUSTOM_OP(strided_slice, 1, 1, false, 0, 5); // TODO: new op type needed. that returns VIEW @@ -347,7 +347,7 @@ namespace nd4j { /** * This operation extracts a slice from a tensor. - * + * */ #if NOT_EXCLUDED(OP_slice) DECLARE_CUSTOM_OP(slice, 1, 1, false, 0, -2); @@ -360,12 +360,12 @@ namespace nd4j { * start: optional scalar with starting value * stop: optional scalar with end value * step: optional scalar witn step value - * + * * Int args: (optional) * 0: optional scalar with starting value * 1: optional scalar with end value * 1: optional scalar witn step value - * + * * T args: (optional) * 0: optional scalar with starting value * 1: optional scalar with end value @@ -379,11 +379,11 @@ namespace nd4j { * This operation return one-hot encoded n-dimensional array * Expected arguments: * input: N-dimensional array - * + * * T args: * 0: 'on' value * 1: 'off' value - * + * * Int args: * 0: depth * 1: axis @@ -413,7 +413,7 @@ namespace nd4j { * This operation stacks a list of rank tensors into one rank-(R+1) tensor. 
@@ -413,7 +413,7 @@ namespace nd4j { * This operation stacks a list of rank-R tensors into one rank-(R+1) tensor. * Expected arguments: * 0...: N-Dimensional arrays to stack - * + * */ #if NOT_EXCLUDED(OP_stack) DECLARE_CUSTOM_OP(stack, -1, 1, false, 0, 0); @@ -423,7 +423,7 @@ namespace nd4j { * This operation returns the length of the input array * Expected arguments: * input: N-dimensional array - * + * * TODO: make this operation a reduction, to allow TAD -> size */ #if NOT_EXCLUDED(OP_size) @@ -447,7 +447,7 @@ namespace nd4j { * This operation takes input's shape, and returns new NDArray filled with zeros * Expected arguments: * input: N-dimensional array - * + * */ #if NOT_EXCLUDED(OP_zeros_as) DECLARE_OP(zeros_as, 1, 1, false); @@ -457,7 +457,7 @@ namespace nd4j { * This operation takes input's shape, and returns new NDArray filled with ones * Expected arguments: * input: N-dimensional array - * + * */ #if NOT_EXCLUDED(OP_ones_as) DECLARE_OP(ones_as, 1, 1, false); @@ -512,10 +512,10 @@ namespace nd4j { * Input arrays: * 0 - shape vector * 1 - optional scalar NDArray - * + * * T arguments: * 0 - optional scalar value - * + * */ #if NOT_EXCLUDED(OP_fill) DECLARE_CUSTOM_OP(fill, 1, 1, false, -2, 0); @@ -527,10 +527,10 @@ namespace nd4j { * 0 - input array * 1 - array of sizes * 2 - optional axis - * + * * Integer arguments: * 0 - optional axis - * + * */ #if NOT_EXCLUDED(OP_split_v) DECLARE_CUSTOM_OP(split_v, 2, -1, false, 0, -2); @@ -540,7 +540,7 @@ namespace nd4j { * This operation splits given NDArray into chunks of specific size, along given dimension * 0 - input array * 1 - optional axis - * + * * Integer arguments: * 0 - number of splits * 1 - optional axis @@ -553,39 +553,37 @@ namespace nd4j { /** * This operation adjusts image hue by delta * Input arrays: - * 0 - 1D or 3D input array, must have 3 channels. - * 1 - optional scalar, delta value - * + * 0 - input array with rank >= 3, must have at least one dimension equal to 3, i.e. the dimension containing channels. + * * T arguments: - * 0 - optional delta value - * + * 0 - delta value + * * Int arguments: - * 0 - optional argument, isNHWC. false by default. + * 0 - optional argument, the index of the dimension holding the 3 channels */ #if NOT_EXCLUDED(OP_adjust_hue) - DECLARE_CONFIGURABLE_OP(adjust_hue, 1, 1, true, -2, -2); + DECLARE_CONFIGURABLE_OP(adjust_hue, 1, 1, true, 1, -2); #endif /** * This operation adjusts image saturation by delta * Input arrays: - * 0 - 1D or 3D input array, must have 3 channels. - * 1 - optional scalar, delta value - * + * 0 - input array with rank >= 3, must have at least one dimension equal to 3, i.e. the dimension containing channels. + * * T arguments: - * 0 - optional delta value - * + * 0 - saturation factor + * * Int arguments: - * 0 - optional argument, isNHWC. false by default.
+ * 0 - optional argument, the index of the dimension holding the 3 channels */ #if NOT_EXCLUDED(OP_adjust_saturation) - DECLARE_CONFIGURABLE_OP(adjust_saturation, 1, 1, true, -2, -2); + DECLARE_CONFIGURABLE_OP(adjust_saturation, 1, 1, true, 1, -2); #endif /** - * - * + * + * * */ #if NOT_EXCLUDED(OP_depth_to_space) @@ -593,8 +591,8 @@ namespace nd4j { #endif /** - * - * + * + * * */ #if NOT_EXCLUDED(OP_space_to_depth) @@ -612,23 +610,23 @@ namespace nd4j { #endif /** - * - * + * + * */ #if NOT_EXCLUDED(OP_space_to_batch) DECLARE_CUSTOM_OP(space_to_batch, 1, 1, false, 0, -2); #endif /** - * - * + * + * */ #if NOT_EXCLUDED(OP_batch_to_space) DECLARE_CUSTOM_OP(batch_to_space, 1, 1, false, 0, -2); #endif /** - * top_k operation returns a vector of k top values for + * top_k operation returns a vector of k top values for * a given NDArray; with the default boolean argument (true), * the resulting index array * will be sorted by the values in descending order. @@ -641,7 +639,7 @@ namespace nd4j { #endif /** - * in_top_k operation returns a vector of k boolean values for + * in_top_k operation returns a vector of k boolean values for * a given NDArray, a 2D matrix of predictions, checking whether each target index is among the k top values of its row. * The first parameter is an NDArray of predicted values (2d array). * The second is an NDArray vector of target indices to be searched among the k top values. @@ -655,7 +653,7 @@ namespace nd4j { * moments operation calculates a mean and variance for a given NDArray, * reducing the result according to the given axis array. * For full axis the result is both mean and variance of all members in array. - * Otherwise there are two NDArrays with means and variances for the given axes. + * Otherwise there are two NDArrays with means and variances for the given axes. * Axes can be put as the second NDArray or as an int vector. * * the optional flag "keep_dims" can be set as T param @@ -673,13 +671,13 @@ namespace nd4j { #endif /** - * dynamic_partition - partitions an input tensor into num_partitions + * dynamic_partition - partitions an input tensor into num_partitions * according to the given index array. * * the first param - NDArray to be partitioned. * the second param - index array * the third param (integer param) - num of partitions. - * + * * returns a num of NDArrays as output */ #if NOT_EXCLUDED(OP_dynamic_partition) @@ -691,14 +689,14 @@ namespace nd4j { #endif /** - * dynamic_stitch - merges partitions (the second param, a list of input tensors) + * dynamic_stitch - merges partitions (the second param, a list of input tensors) * into a single tensor according to the given index array. * * the first param - index array * the second params - tensors to be merged - * + * * returns a single NDArray as output - * + * * the operation is the inverse of dynamic_partition */ #if NOT_EXCLUDED(OP_dynamic_stitch) @@ -744,7 +742,7 @@ namespace nd4j { #endif /** - * normalize_moments operation normalizes already calculated mean and variance + * normalize_moments operation normalizes already calculated mean and variance * according to shift and count. * input params: * - count of data * - tensor with variance (the same shape as before) * * - optional floating point param shift. - * + * * returns a normalized pair, mean and variance, with the same shapes as input */ #if NOT_EXCLUDED(OP_normalize_moments)
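// For illustration only: a standalone sketch of the dynamic_partition contract documented
// above (dynamic_stitch routes the pieces back by the same indices); names are illustrative.
#include <cstddef>
#include <vector>

// Element x[i] is routed to partition idx[i]; with idx = {0, 1, 0} and x = {a, b, c},
// the result is {{a, c}, {b}}.
static std::vector<std::vector<float>> dynamicPartition(const std::vector<float>& x,
                                                        const std::vector<int>& idx,
                                                        int numPartitions) {
    std::vector<std::vector<float>> parts(numPartitions);
    for (std::size_t i = 0; i < x.size(); ++i)
        parts[idx[i]].push_back(x[i]);
    return parts;
}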
@@ -766,8 +764,8 @@ * input params: * - input tensor * - axes vector - * - * + * + * * - optional floating point param shift. * - optional int (as bool) keep_dimension * @@ -787,7 +785,7 @@ namespace nd4j { * 0 - target * 1 - input * 2 - weights (scalar or vector with the same length as the last dimension) - * + * * return value - a tensor with the same shape as target or input */ #if NOT_EXCLUDED(OP_weighted_cross_entropy_with_logits) @@ -799,7 +797,7 @@ namespace nd4j { * Input arguments * 0 - input tensor * 1 - noise_shape - (vector with shape to reduce) - optional - * + * * int parameter - seed for random numbers * T parameter - probability (should be between 0 and 1) * return value - a tensor with the same shape as target or input @@ -825,13 +823,13 @@ namespace nd4j { /** * bincount operation returns a vector with element counts. - * + * * input params: * - input tensor - only integer values are accepted * - weights - the same shape tensor with integer weights for element (optional) * default weight - 1,1,1..,1 for all values in the tensor - * - * optional ints: + * + * optional ints: * - min_length - zero or greater * - max_length - between min_length and max(input) + 1 * @@ -852,7 +850,7 @@ namespace nd4j { * 1 - the second shape (vector with shape) * * return value: - * vector with broadcasted shape + * vector with broadcasted shape */ #if NOT_EXCLUDED(OP_broadcast_dynamic_shape) DECLARE_CUSTOM_OP(broadcast_dynamic_shape, 2, 1, false, 0, 0); @@ -865,7 +863,7 @@ namespace nd4j { * 0 - the tensor with dimension (x * y * z * ::: * M * M) * * return value: - * tensor with dimension (x * y * z * ::: *) with determinant for all + * tensor with dimension (x * y * z * ::: *) with determinant for all * M x M matrices */ #if NOT_EXCLUDED(OP_matrix_determinant) @@ -901,7 +899,7 @@ namespace nd4j { #if NOT_EXCLUDED(OP_logdet) DECLARE_CUSTOM_OP(logdet, 1, 1, false, 0, 0); #endif - + /** * matrix_inverse op. - make inverse for all 2D square matrices found in the input tensor * * input params: * 0 - the tensor with dimension (x * y * z * ::: * M * M) * * return value: - * tensor with dimension (x * y * z * ::: * M * M) with inverse M x M matrices in it + * tensor with dimension (x * y * z * ::: * M * M) with inverse M x M matrices in it */ #if NOT_EXCLUDED(OP_matrix_inverse) DECLARE_OP(matrix_inverse, 1, 1, true); @@ -1151,16 +1149,16 @@ namespace nd4j { /** * roll - op ported from numpy (https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.roll.html) - * + * * input params: * 0 - NDArray - * + * * int params: * 0 - shift * 1 - axis 1 * 2 - axis 2 * ... - * N - axis N + * N - axis N * * All axes are optional and should be between 0 and input->rankOf(). Of course, all axes can be repeated. * @@ -1173,12 +1171,12 @@ namespace nd4j { /** * lin_space - op ported from TF (https://www.tensorflow.org/api_docs/python/tf/lin_space) - * + * * input params: * 0 - startVal - NDArray scalar (floating point) * 1 - finishVal - NDArray scalar (floating point) * 2 - numOfElements - NDArray scalar (integer) - * + * * output: * 0 - 1D NDArray with the same type as input and length as given with numOfElements param. */
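// For illustration only: lin_space has simple, fully determined semantics, so a standalone
// sketch is easy to state; this mirrors tf.lin_space (start and stop inclusive), and the
// name and signature are illustrative rather than the library API.
#include <vector>

static std::vector<float> linSpace(float start, float stop, int num) {
    std::vector<float> out(num);
    const float step = num > 1 ? (stop - start) / (num - 1) : 0.f;
    for (int i = 0; i < num; ++i)
        out[i] = start + step * i;   // linSpace(0, 1, 5) -> {0, 0.25, 0.5, 0.75, 1}
    return out;
}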
@@ -1188,10 +1186,10 @@ namespace nd4j { /** * reduction_sum - tf.reduction_sum operation - * + * * input params: * 0 - NDArray - * + * * T_ARG param (optional): * 0 - keep_dims != 0. * @@ -1199,7 +1197,7 @@ * 0 - axis 1 * 1 - axis 2 * ... - * N-1 axis N + * N-1 axis N * * All axes are optional and should be between 0 and input->rankOf() - 1 * @@ -1216,10 +1214,10 @@ namespace nd4j { /** * reduction_prod - tf.reduction_prod operation - * + * * input params: * 0 - NDArray - * + * * T_ARG param (optional): * 0 - keep_dims != 0. * @@ -1227,7 +1225,7 @@ * 0 - axis 1 * 1 - axis 2 * ... - * N-1 axis N + * N-1 axis N * * All axes are optional and should be between 0 and input->rankOf() - 1 * @@ -1246,7 +1244,7 @@ namespace nd4j { * This op calculates min of elements along given dimensions * * input array: - * x: tensor to calculate mins for + * x: tensor to calculate mins for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero @@ -1268,7 +1266,7 @@ namespace nd4j { * This op calculates max of elements along given dimensions * * input array: - * x: tensor to calculate maxes for + * x: tensor to calculate maxes for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero @@ -1290,7 +1288,7 @@ namespace nd4j { * This op calculates norm1 of elements along given dimensions * * input array: - * x: tensor to calculate norm1 for + * x: tensor to calculate norm1 for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero @@ -1312,7 +1310,7 @@ namespace nd4j { * This op calculates norm2 of elements along given dimensions * * input array: - * x: tensor to calculate norm2 for + * x: tensor to calculate norm2 for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero @@ -1335,7 +1333,7 @@ namespace nd4j { * This op calculates squared norm of elements along given dimensions * * input array: - * x: tensor to calculate squared norm for + * x: tensor to calculate squared norm for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero @@ -1357,7 +1355,7 @@ namespace nd4j { * This op calculates norm max of elements along given dimensions * * input array: - * x: tensor to calculate norm max for + * x: tensor to calculate norm max for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero @@ -1379,7 +1377,7 @@ namespace nd4j { * This op calculates mean of elements along given dimensions * * input array: - * x: tensor to calculate mean for + * x: tensor to calculate mean for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero @@ -1401,7 +1399,7 @@ namespace nd4j { * This op calculates sample variance of elements along given dimensions * * input array: - * x: tensor to calculate variance for + * x: tensor to calculate variance for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero @@ -1420,7 +1418,7 @@ namespace nd4j { * This op calculates sample standard deviation of elements along given dimensions * * input array: - * x: tensor to calculate standard deviation for + * x: tensor to calculate standard deviation for * * float arguments: * keepDims: if non zero, then keep reduced dimensions with length = 1, default value is zero
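// For illustration only: the keepDims flag shared by the reduce_* ops above affects only
// the output shape; a standalone sketch of that rule (illustrative, not the library helper).
#include <cstddef>
#include <vector>

// Reducing shape [2,3,4] over axes {1} yields [2,1,4] with keepDims, [2,4] without.
static std::vector<long> reducedShape(const std::vector<long>& shape,
                                      const std::vector<int>& axes, bool keepDims) {
    std::vector<bool> isReduced(shape.size(), false);
    for (int a : axes)
        isReduced[a] = true;
    std::vector<long> out;
    for (std::size_t i = 0; i < shape.size(); ++i) {
        if (!isReduced[i])
            out.push_back(shape[i]);       // kept dimension passes through
        else if (keepDims)
            out.push_back(1);              // reduced dimension collapses to length 1
    }
    return out;
}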
@@ -1439,13 +1437,13 @@ namespace nd4j { * This op calculates backprop dot for two tensors along given dimensions * * input array: - * x: tensor to calculate dot for - * y: tensor to calculate dot for + * x: tensor to calculate dot for + * y: tensor to calculate dot for * z: tensor with gradient output of the FF dot for x and y * * int arguments: - * list of integers - dimensions to calculate dot along, - * default corresponds to empty list in which case calculation + * list of integers - dimensions to calculate dot along, + * default corresponds to empty list in which case calculation * is performed for all dimensions and a scalar is returned. * * output array: @@ -1458,7 +1456,7 @@ namespace nd4j { #endif /** * reduce_logsumexp - tf.reduce_logsumexp operation - * + * * input params: * 0 - NDArray (input) * 1 - 1D NDArray (axis) (optional) - integer array @@ -1470,7 +1468,7 @@ * 0 - axis 1 * 1 - axis 2 * ... - * N-1 axis N + * N-1 axis N * * CAUTION: All axes are optional and should be between 0 and input->rankOf() - 1 * and put either with the second param or as integers, but not both @@ -1619,7 +1617,7 @@ namespace nd4j { #endif /** * fake_quant_with_min_max_vals - tf.quantization.fake_quant_with_min_max_vars - * + * * input params: * 0 - NDArray (input) * 1 - 0D Tensor - min value diff --git a/libnd4j/include/ops/declarable/headers/recurrent.h b/libnd4j/include/ops/declarable/headers/recurrent.h index be6a43c85..4b2eddc57 100644 --- a/libnd4j/include/ops/declarable/headers/recurrent.h +++ b/libnd4j/include/ops/declarable/headers/recurrent.h @@ -24,7 +24,7 @@ #include namespace nd4j { -namespace ops { +namespace ops { ////////////////////////////////////////////////////////////////////////// /** @@ -48,15 +48,15 @@ namespace ops { ////////////////////////////////////////////////////////////////////////// /** * Implementation of operation for Simple Recurrent Unit (bidirectional case): "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi - * - * Input arrays: + * + * Input arrays: * 0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features * 1: 2d tensor of weights [2K x 6K] * 2: row of biases with twice length [1 x 4K] * 3: 2d tensor of previous cell state [bS x 2K] * 4: optional, 2d tensor of dropout mask [bS x 2K] - * - * Output arrays: + * + * Output arrays: * 0: 3d tensor of cell output [N x bS x 2K] * 1: 3d tensor of cell state [N x bS x 2K] */ @@ -68,8 +68,8 @@ namespace ops { ////////////////////////////////////////////////////////////////////////// /** * Implementation of operation for back propagation in Simple Recurrent Unit: "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi - * - * Input arrays: + * + * Input arrays: * 0: input 3d tensor with shape [bS x K x N], N - number of time steps, bS - batch size, K - number of features * 1: 2d tensor of weights [3K x K] * 2: row of biases with twice length [1 x 2K] * 3: 2d tensor of previous cell state [bS x K] * 4: 3d tensor of cell state [bS x K x N] * 5: 2d tensor of cell state gradients [bS x K] * 6: 3d tensor of state output gradients [bS x K x N] * 7: optional, 2d tensor of dropout mask [bS x K] - * - * Output arrays: + * + * Output arrays: * 0: 3d tensor of input gradients [bS x K x N] * 1: 3d tensor of weights gradients [bS x 3K x K] * 2: 2d, row of biases gradients [1 x 2K]
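// For reference while reading the sru* declarations around here: the recurrence from the
// cited paper (Lei et al., "Training RNNs as Fast as CNNs"), with o denoting the Hadamard
// product; the 3K-sized weight dimension is assumed to pack W, Wf and Wr:
//
//   x~(t) = W  x(t)
//   f(t)  = sigmoid(Wf x(t) + bf)
//   r(t)  = sigmoid(Wr x(t) + br)
//   c(t)  = f(t) o c(t-1) + (1 - f(t)) o x~(t)
//   h(t)  = r(t) o g(c(t)) + (1 - r(t)) o x(t)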
@@ -92,8 +92,8 @@ namespace ops { ////////////////////////////////////////////////////////////////////////// /** * Implementation of operation for back propagation in Simple Recurrent Unit (bidirectional case): "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi - * - * Input arrays: + * + * Input arrays: * 0: input 3d tensor with shape [N x bS x 2K], N - number of time steps, bS - batch size, K - number of features * 1: 2d tensor of weights [2K x 6K] * 2: row of biases with twice length [1 x 4K] @@ -102,13 +102,13 @@ namespace ops { * 5: 2d tensor of cell state gradients [bS x 2K] * 6: 3d tensor of state output gradients [N x bS x 2K] * 7: optional, 2d tensor of dropout mask [bS x 2K] - * - * Output arrays: + * + * Output arrays: * 0: 3d tensor of input gradients [N x bS x 2K] * 1: 3d tensor of weights gradients [N x 2K x 6K] * 2: 2d, row of biases gradients [1 x 4K] * 3: 2d, tensor of state gradients [bS x 2K] - */ + */ #if NOT_EXCLUDED(OP_sru_bi) DECLARE_CUSTOM_OP(sru_bi_bp, 8, 4, true, 0, 0); #endif @@ -118,20 +118,20 @@ namespace ops { ////////////////////////////////////////////////////////////////////////// /** * Implementation of operation for LSTM cell with peephole connections: * S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural Computation - * and + * and * https://research.google.com/pubs/archive/43905.pdf - * Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory recurrent neural network architectures for large scale acoustic modeling." INTERSPEECH, 2014. + * Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory recurrent neural network architectures for large scale acoustic modeling." INTERSPEECH, 2014. * - * Input arrays: + * Input arrays: * 0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features - * 1: previous cell output [batchSize x numProj], that is at previous time step t-1, in case of projection=false -> numProj=numUnits!!! - * 2: previous cell state [batchSize x numUnits], that is at previous time step t-1 - * 3: input-to-hidden weights, [inSize x 4*numUnits] - * 4: hidden-to-hidden weights, [numProj x 4*numUnits] - * 5: diagonal weights for peephole connections [3*numUnits] - * 6: projection weights [numUnits x numProj] - * 7: biases, [4*numUnits] - * + * 1: previous cell output [batchSize x numProj], that is at previous time step t-1, in case of projection=false -> numProj=numUnits!!! + * 2: previous cell state [batchSize x numUnits], that is at previous time step t-1 + * 3: input-to-hidden weights, [inSize x 4*numUnits] + * 4: hidden-to-hidden weights, [numProj x 4*numUnits] + * 5: diagonal weights for peephole connections [3*numUnits] + * 6: projection weights [numUnits x numProj] + * 7: biases, [4*numUnits] + * * Input integer arguments: * 0: if not zero, provide peephole connections * 1: if not zero, then projection is performed, if zero then numProj==numUnits is mandatory! @@ -140,11 +140,11 @@ namespace ops { * 0: clipping value for cell state, if it is not equal to zero, then cell state is clipped * 1: clipping value for projected cell output, if it is not equal to zero, then projected cell output is clipped * 2: the bias added to forget gates in order to reduce the scale of forgetting in the beginning of the training - * - * Output arrays: + * + * Output arrays: * 0: current cell output [batchSize x numProj], that is at current time step t * 1: current cell state [batchSize x numUnits], that is at current time step t - */ + */ #if NOT_EXCLUDED(OP_lstmCell) DECLARE_CUSTOM_OP(lstmCell, 8, 2, false, 3, 2); #endif @@ -154,12 +154,12 @@ namespace ops { ////////////////////////////////////////////////////////////////////////// /** * Implementation of operation for LSTM cell with optional peephole connections: * S. Hochreiter and J. Schmidhuber. "Long Short-Term Memory". Neural Computation - * and + * and * https://research.google.com/pubs/archive/43905.pdf * Hasim Sak, Andrew Senior, and Francoise Beaufays. "Long short-term memory recurrent neural network architectures for large scale acoustic modeling." INTERSPEECH, 2014.
* See also: https://arxiv.org/pdf/1503.04069.pdf * - * Input arrays: + * Input arrays: * 0: input [bS, inSize] at time t * 1: previous cell state [bS, numUnits], time t-1 * 2: previous output [bS, numUnits], time t-1 @@ -168,15 +168,15 @@ namespace ops { * 5: weights - cell peephole (t-1) connections to forget gate, [numUnits] * 6: weights - cell peephole (t) connections to output gate, [numUnits] * 7: biases, shape [4*numUnits] - * + * * Input integer arguments: * 0: if not zero, provide peephole connections * * Input float arguments: * 0: the bias added to forget gates in order to reduce the scale of forgetting in the beginning of the training * 1: clipping value for cell state, if it is not equal to zero, then cell state is clipped - * - * Output arrays: + * + * Output arrays: * 0: i - Input modulation gate activations [bS, numUnits] * 1: c (cs) - Cell state (pre tanh) [bs, numUnits] (cs) * 2: f - Output - forget gate activations [bs, numUnits] @@ -184,7 +184,7 @@ namespace ops { * 4: z (ci) - Output - block input [bs, numUnits] * 5: h (co) - Cell state, post tanh [bs, numUnits] * 6: y (h) - Current cell output [bS, numUnits], time t - */ + */ #if NOT_EXCLUDED(OP_lstmBlockCell) DECLARE_CUSTOM_OP(lstmBlockCell, 8, 7, false, 2, 1); #endif @@ -230,21 +230,21 @@ namespace ops { #if NOT_EXCLUDED(OP_lstmBlock) DECLARE_CUSTOM_OP(lstmBlock, 9, 7, false, 2, 2); #endif - + ////////////////////////////////////////////////////////////////////////// /** * Implementation of operations for Simple Recurrent Unit cell: "Training RNNs as Fast as CNNs" Tao Lei, Yu Zhang, Yoav Artzi * - * Input arrays: + * Input arrays: * 0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features * 1: previous cell state [batchSize x inSize], that is at previous time step t-1 * 2: weights [inSize x 3*inSize] * 3: biases [1 x 2*inSize] - * - * Output arrays: + * + * Output arrays: * 0: current cell output [batchSize x inSize], that is at current time step t * 1: current cell state [batchSize x inSize], that is at current time step t - */ + */ #if NOT_EXCLUDED(OP_sruCell) DECLARE_CUSTOM_OP(sruCell, 4, 2, false, 0, 0); #endif @@ -253,45 +253,45 @@ namespace ops { ////////////////////////////////////////////////////////////////////////// /** * Implementation of gated Recurrent Unit cell: - * Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, Yoshua Bengio + * Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, Yoshua Bengio * "Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation" * - * Input arrays: + * Input arrays: * 0: input with shape [batchSize x inSize], batchSize - batch size, inSize - number of features * 1: previous cell output [batchSize x numUnits], that is at previous time step t-1 - * 2: RU weights - [(nIn+nOut), 2*numUnits] - reset and update gates (input/recurrent weights) - * 3: C weights - [(nIn+nOut), numUnits] - cell gate (input/recurrent weights) + * 2: RU weights - [(inSize+numUnits), 2*numUnits] - reset and update gates (input/recurrent weights) + * 3: C weights - [(inSize+numUnits), numUnits] - cell gate (input/recurrent weights) * 4: reset and update biases, [2*numUnits] - reset and update gates * 5: cell biases, [numUnits] - * - * Output arrays: + * + * Output arrays: * 0: Reset gate output [bS, numUnits] * 1: Update gate output [bS, numUnits] * 2: Cell gate output [bS, numUnits] * 3: Current cell output [bS, numUnits] - */ 
+ */ #if NOT_EXCLUDED(OP_gruCell) DECLARE_CUSTOM_OP(gruCell, 6, 4, false, 0, 0); #endif #if NOT_EXCLUDED(OP_gruCell) - DECLARE_CUSTOM_OP(gruCell_bp, 6, 5, false, 0, 0); + DECLARE_CUSTOM_OP(gruCell_bp, 10, 6, false, 0, 0); #endif ////////////////////////////////////////////////////////////////////////// /** * Implementation of operation "LSTM time sequences" with peep hole connections: * - * Input arrays: + * Input arrays: * 0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features - * 1: initial cell output [batchSize x numProj], that is at time step = 0, in case of projection=false -> numProj=numUnits!!! - * 2: initial cell state [batchSize x numUnits], that is at time step = 0 - * 3: input-to-hidden weights, [inSize x 4*numUnits] - * 4: hidden-to-hidden weights, [numProj x 4*numUnits] - * 5: diagonal weights for peephole connections [3*numUnits] - * 6: projection weights [numUnits x numProj] - * 7: biases, [4*numUnits] - * + * 1: initial cell output [batchSize x numProj], that is at time step = 0, in case of projection=false -> numProj=numUnits!!! + * 2: initial cell state [batchSize x numUnits], that is at time step = 0 + * 3: input-to-hidden weights, [inSize x 4*numUnits] + * 4: hidden-to-hidden weights, [numProj x 4*numUnits] + * 5: diagonal weights for peephole connections [3*numUnits] + * 6: projection weights [numUnits x numProj] + * 7: biases, [4*numUnits] + * * Input integer arguments: * 0: if not zero, provide peephole connections * 1: if not zero, then projection is performed, if zero then numProj==numUnits is mandatory! @@ -300,11 +300,11 @@ namespace ops { * 0: clipping value for cell state, if it is not equal to zero, then cell state is clipped * 1: clipping value for projected cell output, if it is not equal to zero, then projected cell output is clipped * 2: the bias added to forget gates in order to reduce the scale of forgetting in the beginning of the training - * - * Output arrays: + * + * Output arrays: * 0: cell outputs [time x batchSize x numProj], that is per each time step * 1: cell states [time x batchSize x numUnits], that is per each time step - */ + */ #if NOT_EXCLUDED(OP_lstm) DECLARE_CUSTOM_OP(lstm, 8, 2, false, 3, 2); #endif @@ -313,16 +313,16 @@ namespace ops { /** * Implementation of gated Recurrent Unit: * - * Input arrays: + * Input arrays: * 0: input with shape [time x batchSize x inSize], time - number of time steps, batchSize - batch size, inSize - number of features * 1: initial cell output [batchSize x numUnits], that is at time step = 0 - * 2: input-to-hidden weights, [inSize x 3*numUnits] - * 3: hidden-to-hidden weights, [numUnits x 3*numUnits] - * 4: biases, [3*numUnits] - * - * Output arrays: - * 0: cell outputs [time x batchSize x numUnits], that is per each time step - */ + * 2: input-to-hidden weights, [inSize x 3*numUnits] + * 3: hidden-to-hidden weights, [numUnits x 3*numUnits] + * 4: biases, [3*numUnits] + * + * Output arrays: + * 0: cell outputs [time x batchSize x numUnits], that is per each time step + */ #if NOT_EXCLUDED(OP_gru) DECLARE_CUSTOM_OP(gru, 5, 1, false, 0, 0); #endif diff --git a/libnd4j/include/ops/declarable/helpers/adjust_hue.h b/libnd4j/include/ops/declarable/helpers/adjust_hue.h index dda47ad99..3ccdfdd60 100644 --- a/libnd4j/include/ops/declarable/helpers/adjust_hue.h +++ b/libnd4j/include/ops/declarable/helpers/adjust_hue.h @@ -15,120 +15,204 @@ ******************************************************************************/ // -// @author 
raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include #include -namespace nd4j { -namespace ops { +namespace nd4j { +namespace ops { namespace helpers { - template - static FORCEINLINE _CUDA_HD void rgb_to_hv(T r, T g, T b, T* h, T* v_min, T* v_max) { - T v_mid; - int h_category; - // According to the figures in: - // https://en.wikipedia.org/wiki/HSL_and_HSV#Hue_and_chroma - // For the conditions, we don't care about the case where two components are - // equal. It is okay to count it in either side in that case. - if (r < g) { - if (b < r) { - // b < r < g - *v_max = g; - v_mid = r; - *v_min = b; - h_category = 1; - } else if (b > g) { - // r < g < b - *v_max = b; - v_mid = g; - *v_min = r; - h_category = 3; - } else { - // r < b < g - *v_max = g; - v_mid = b; - *v_min = r; - h_category = 2; - } + + + void adjustHue(nd4j::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC); + + + +//////////////////////////////////////////////////////////////////////////////// +template +FORCEINLINE _CUDA_HD void rgbToHsv(const T& r, const T& g, const T& b, T& h, T& s, T& v) { + + // h values are in range [0, 360) + // s and v values are in range [0, 1] + + const T max = nd4j::math::nd4j_max(r, nd4j::math::nd4j_max(g, b)); + const T min = nd4j::math::nd4j_min(r, nd4j::math::nd4j_min(g, b)); + const T c = max - min; + + // calculate h + if(c == 0) { + h = 0; + } + else if(max == r) { + h = 60.f * ((g - b) / c) + (g >= b ? 0 : 360); + } + else if(max == g) { + h = 60.f * ((b - r) / c) + 120; + } + else { // max == b + h = 60.f * ((r - g) / c) + 240; + } + + // calculate s + s = max == (T)0 ? (T)0 : c / max; + + // calculate v + v = max / 255.f; +} + +//////////////////////////////////////////////////////////////////////////////// +template +FORCEINLINE _CUDA_HD void hsvToRgb(const T& h, const T& s, const T& v, T& r, T& g, T& b) { + + const float sector = h / 60.f; + const T c = v * s; + + if(0.f <= sector && sector < 1.f) { + r = v; + g = v - c * (1 - sector); + b = v - c; + } + else if(1.f <= sector && sector < 2.f) { + r = v - c * (sector - 1); + g = v; + b = v - c; + } + else if(2.f <= sector && sector < 3.f) { + r = v - c; + g = v; + b = v - c * (3 - sector); + } + else if(3.f <= sector && sector < 4.f) { + r = v - c; + g = v - c * (sector - 3); + b = v; + } + else if(4.f <= sector && sector < 5.f) { + r = v - c * (5 - sector); + g = v - c; + b = v; + } + else { // 5.f <= sector < 6.f + r = v; + g = v - c; + b = v - c * (sector - 5); + } + + r *= 255; + g *= 255; + b *= 255; +} + +/*//////////////////////////////////////////////////////////////////////////////// +template +static FORCEINLINE _CUDA_HD void rgb_to_hv(T r, T g, T b, T* h, T* v_min, T* v_max) { + T v_mid; + int h_category; + // According to the figures in: + // https://en.wikipedia.org/wiki/HSL_and_HSV#Hue_and_chroma + // For the conditions, we don't care about the case where two components are + // equal. It is okay to count it in either side in that case. 
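// A quick standalone sanity check of the rgbToHsv/hsvToRgb pair defined above (this
// worked example is illustrative and not part of the library); both sides work with
// 255-based RGB values and v in [0, 1]:
//
//   float h, s, v, r2, g2, b2;
//   rgbToHsv(120.f, 200.f, 50.f, h, s, v);   // h = 92, s = 0.75, v = 200/255
//   hsvToRgb(h, s, v, r2, g2, b2);           // recovers r2 = 120, g2 = 200, b2 = 50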
+ if (r < g) { + if (b < r) { + // b < r < g + *v_max = g; + v_mid = r; + *v_min = b; + h_category = 1; + } else if (b > g) { + // r < g < b + *v_max = b; + v_mid = g; + *v_min = r; + h_category = 3; } else { - // g < r - if (b < g) { - // b < g < r - *v_max = r; - v_mid = g; - *v_min = b; - h_category = 0; - } else if (b > r) { - // g < r < b - *v_max = b; - v_mid = r; - *v_min = g; - h_category = 4; - } else { - // g < b < r - *v_max = r; - v_mid = b; - *v_min = g; - h_category = 5; - } + // r < b < g + *v_max = g; + v_mid = b; + *v_min = r; + h_category = 2; } - if (*v_max == *v_min) { - *h = 0; - return; - } - auto ratio = (v_mid - *v_min) / (*v_max - *v_min); - bool increase = ((h_category & 0x1) == 0); - *h = h_category + (increase ? ratio : (1 - ratio)); - } - - template - static FORCEINLINE _CUDA_HD void hv_to_rgb(T h, T v_min, T v_max, T* r, T* g, T* b) { - int h_category = static_cast(h); - T ratio = h - (T)h_category; - bool increase = ((h_category & 0x1) == 0); - if (!increase) - ratio = 1 - ratio; - - T v_mid = v_min + ratio * (v_max - v_min); - // According to the figures in: - // https://en.wikipedia.org/wiki/HSL_and_HSV#Hue_and_chroma - switch (h_category) { - case 0: - *r = v_max; - *g = v_mid; - *b = v_min; - break; - case 1: - *r = v_mid; - *g = v_max; - *b = v_min; - break; - case 2: - *r = v_min; - *g = v_max; - *b = v_mid; - break; - case 3: - *r = v_min; - *g = v_mid; - *b = v_max; - break; - case 4: - *r = v_mid; - *g = v_min; - *b = v_max; - break; - case 5: - default: - *r = v_max; - *g = v_min; - *b = v_mid; + } else { + // g < r + if (b < g) { + // b < g < r + *v_max = r; + v_mid = g; + *v_min = b; + h_category = 0; + } else if (b > r) { + // g < r < b + *v_max = b; + v_mid = r; + *v_min = g; + h_category = 4; + } else { + // g < b < r + *v_max = r; + v_mid = b; + *v_min = g; + h_category = 5; } } + if (*v_max == *v_min) { + *h = 0; + return; + } + auto ratio = (v_mid - *v_min) / (*v_max - *v_min); + bool increase = ((h_category & 0x1) == 0); + *h = h_category + (increase ? 
ratio : (1 - ratio)); +} - void _adjust_hue(nd4j::LaunchContext * context, NDArray *input, NDArray *output, NDArray *delta, bool isNHWC); +//////////////////////////////////////////////////////////////////////////////// +template +static FORCEINLINE _CUDA_HD void hv_to_rgb(T h, T v_min, T v_max, T* r, T* g, T* b) { + int h_category = static_cast(h); + T ratio = h - (T)h_category; + bool increase = ((h_category & 0x1) == 0); + if (!increase) + ratio = 1 - ratio; + + T v_mid = v_min + ratio * (v_max - v_min); + // According to the figures in: + // https://en.wikipedia.org/wiki/HSL_and_HSV#Hue_and_chroma + switch (h_category) { + case 0: + *r = v_max; + *g = v_mid; + *b = v_min; + break; + case 1: + *r = v_mid; + *g = v_max; + *b = v_min; + break; + case 2: + *r = v_min; + *g = v_max; + *b = v_mid; + break; + case 3: + *r = v_min; + *g = v_mid; + *b = v_max; + break; + case 4: + *r = v_mid; + *g = v_min; + *b = v_max; + break; + case 5: + default: + *r = v_max; + *g = v_min; + *b = v_mid; + } +} + +*/ } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/adjust_saturation.h b/libnd4j/include/ops/declarable/helpers/adjust_saturation.h index 503c45626..cd0930cc0 100644 --- a/libnd4j/include/ops/declarable/helpers/adjust_saturation.h +++ b/libnd4j/include/ops/declarable/helpers/adjust_saturation.h @@ -15,16 +15,21 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include #include #include -namespace nd4j { -namespace ops { +namespace nd4j { +namespace ops { namespace helpers { + + void adjustSaturation(nd4j::LaunchContext* context, const NDArray *input, const NDArray* factorScalarArr, NDArray *output, const int dimC); + +/* template static FORCEINLINE _CUDA_HD void rgb_to_hsv(T r, T g, T b, T* h, T* s, T* v) { T vv = nd4j::math::nd4j_max(r, nd4j::math::nd4j_max(g, b)); @@ -63,10 +68,10 @@ namespace helpers { T fmodu = dh; while (fmodu <= (T) 0) fmodu += (T) 2.0f; - + while (fmodu >= (T) 2.0f) fmodu -= (T) 2.0f; - + T x = c * (1. - nd4j::math::nd4j_abs(fmodu - 1.)); switch (h_category) { case 0: @@ -104,13 +109,13 @@ namespace helpers { gg = 0; bb = 0; } - + *r = rr + m; *g = gg + m; *b = bb + m; } +*/ - void adjust_saturation(nd4j::LaunchContext * context, NDArray *input, NDArray *output, NDArray *delta, bool isNHWC); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp index e25ad1b23..fa9ab7b40 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp @@ -15,107 +15,177 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include +#include namespace nd4j { namespace ops { namespace helpers { - template - static void _adjust_hue_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { - // we're 100% sure it's 3 - const int numChannels = 3; - int tuples = array->lengthOf() / numChannels; - auto bIn = reinterpret_cast(array->buffer()); - auto bOut = reinterpret_cast(output->buffer()); - static const int kChannelRange = 6; - int stridesDim = isNHWC ? 
2 : 0; - if (isNHWC) { - // for NHWC our rgb values are stored one by one - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < tuples; e++) { - auto i = bIn + e * numChannels; - auto o = bOut + e * numChannels; +template +static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC) { - T h, v_min, v_max; - helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max); + const T delta = deltaScalarArr->e(0); + const int rank = input->rankOf(); - h += delta * kChannelRange; - while (h < (T) 0.) - h += (T) kChannelRange; - - while (h >= (T) kChannelRange) - h -= (T) kChannelRange; + const T* x = input->bufferAsT(); + T* z = output->bufferAsT(); - helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2); - } - } else { - auto tadsChannelsIn = array->allTensorsAlongDimension({0}); - auto tadsChannelsOut = output->allTensorsAlongDimension( {0}); + if(dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') { - auto bufferR = reinterpret_cast(tadsChannelsIn->at(0)->buffer()); - auto bufferG = reinterpret_cast(tadsChannelsIn->at(1)->buffer()); - auto bufferB = reinterpret_cast(tadsChannelsIn->at(2)->buffer()); + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (Nd4jLong i = 0; i < input->lengthOf(); i += 3) { - auto outputR = reinterpret_cast(tadsChannelsOut->at(0)->buffer()); - auto outputG = reinterpret_cast(tadsChannelsOut->at(1)->buffer()); - auto outputB = reinterpret_cast(tadsChannelsOut->at(2)->buffer()); + T h, s, v; - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < tuples; e++) { - auto _ri = bufferR + e; - auto _gi = bufferG + e; - auto _bi = bufferB + e; + rgbToHsv(x[i], x[i+1], x[i+2], h, s, v); - auto _ro = outputR + e; - auto _go = outputG + e; - auto _bo = outputB + e; + h += delta * 360; + if(h > 360) + h -= 360; + else if(h < 0) + h += 360; - T h, v_min, v_max; - helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max); - - h += delta * kChannelRange; - while (h < (T) 0) - h += (T) kChannelRange; - - while (h >= (T) kChannelRange) - h -= (T) kChannelRange; - - helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo); - } - - delete tadsChannelsIn; - delete tadsChannelsOut; + hsvToRgb(h, s, v, z[i], z[i+1], z[i+2]); } } + else { - void _adjust_hue(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { - auto xType = array->dataType(); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); - float d = delta->e(0); - if (array->rankOf() == 4) { - auto tadsIn = array->allTensorsAlongDimension({0}); - auto tadsOut = output->allTensorsAlongDimension({0}); - int tSize = tadsIn->size(); - // FIXME: template selector should be moved out of loop - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < tSize; e++) { - BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES); - } - + const Nd4jLong numOfTads = packX.numberOfTads(); + const Nd4jLong xDimCstride = input->stridesOf()[dimC]; + const Nd4jLong zDimCstride = output->stridesOf()[dimC]; + + PRAGMA_OMP_PARALLEL_FOR_SIMD + for(Nd4jLong i = 0; i < numOfTads; ++i) { + + const T* xTad = x + packX.platformOffsets()[i]; + T* zTad = z + packZ.platformOffsets()[i]; + + T h, s, v; + + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + + h += delta * 360; + if(h > 360) + h -= 360; + else if(h < 0) 
+ h += 360; + + hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); - delete tadsIn; - delete tadsOut; - } else { - BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, d, isNHWC);, FLOAT_TYPES); } } +} + + +void adjustHue(nd4j::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC) { + + BUILD_SINGLE_SELECTOR(input->dataType(), adjustHue_, (input, deltaScalarArr, output, dimC), LIBND4J_TYPES); +} + +/* +template +static void adjust_hue_single_(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { + // we're 100% sure it's 3 + const int numChannels = 3; + int tuples = array->lengthOf() / numChannels; + auto bIn = reinterpret_cast(array->buffer()); + auto bOut = reinterpret_cast(output->buffer()); + static const int kChannelRange = 6; + + int stridesDim = isNHWC ? 2 : 0; + if (isNHWC) { + // for NHWC our rgb values are stored one by one + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (int e = 0; e < tuples; e++) { + auto i = bIn + e * numChannels; + auto o = bOut + e * numChannels; + + T h, v_min, v_max; + helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max); + + h += delta * kChannelRange; + while (h < (T) 0.) + h += (T) kChannelRange; + + while (h >= (T) kChannelRange) + h -= (T) kChannelRange; + + helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2); + } + } else { + auto tadsChannelsIn = array->allTensorsAlongDimension({0}); + auto tadsChannelsOut = output->allTensorsAlongDimension( {0}); + + auto bufferR = reinterpret_cast(tadsChannelsIn->at(0)->buffer()); + auto bufferG = reinterpret_cast(tadsChannelsIn->at(1)->buffer()); + auto bufferB = reinterpret_cast(tadsChannelsIn->at(2)->buffer()); + + auto outputR = reinterpret_cast(tadsChannelsOut->at(0)->buffer()); + auto outputG = reinterpret_cast(tadsChannelsOut->at(1)->buffer()); + auto outputB = reinterpret_cast(tadsChannelsOut->at(2)->buffer()); + + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (int e = 0; e < tuples; e++) { + auto _ri = bufferR + e; + auto _gi = bufferG + e; + auto _bi = bufferB + e; + + auto _ro = outputR + e; + auto _go = outputG + e; + auto _bo = outputB + e; + + T h, v_min, v_max; + helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max); + + h += delta * kChannelRange; + while (h < (T) 0) + h += (T) kChannelRange; + + while (h >= (T) kChannelRange) + h -= (T) kChannelRange; + + helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo); + } + + delete tadsChannelsIn; + delete tadsChannelsOut; + } +} + +void adjust_hue_(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { + auto xType = array->dataType(); + + float d = delta->e(0); + if (array->rankOf() == 4) { + auto tadsIn = array->allTensorsAlongDimension({0}); + auto tadsOut = output->allTensorsAlongDimension({0}); + int tSize = tadsIn->size(); + // FIXME: template selector should be moved out of loop + PRAGMA_OMP_PARALLEL_FOR + for (int e = 0; e < tSize; e++) { + BUILD_SINGLE_SELECTOR(xType, adjust_hue_single_, (context, tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES); + } + + + delete tadsIn; + delete tadsOut; + } else { + BUILD_SINGLE_SELECTOR(xType, adjust_hue_single_, (context, array, output, d, isNHWC);, FLOAT_TYPES); + } +} + +BUILD_SINGLE_TEMPLATE(template void adjust_hue_single_, (nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC);, FLOAT_TYPES); +*/ - BUILD_SINGLE_TEMPLATE(template void _adjust_hue_single, (nd4j::LaunchContext * context, NDArray 
*array, NDArray *output, float delta, bool isNHWC);, FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp index 58e5c7831..d01a8e2be 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp @@ -15,99 +15,168 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include +#include +#include -namespace nd4j { -namespace ops { +namespace nd4j { +namespace ops { namespace helpers { - template - static void adjust_saturation_single_(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { - // we're 100% sure it's 3 - const int numChannels = 3; - int tuples = array->lengthOf() / numChannels; - auto bIn = reinterpret_cast(array->buffer()); - auto bOut = reinterpret_cast(output->buffer()); - static const int kChannelRange = 6; +template +static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarArr, NDArray *output, const int dimC) { - if (isNHWC) { - // for NHWC our rgb values are stored one by one - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < tuples; e++) { - auto i = bIn + e * numChannels; - auto o = bOut + e * numChannels; + const T factor = factorScalarArr->e(0); + const int rank = input->rankOf(); - T h, s, v; - // Convert the RGB color to Hue/V-range. - helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v); - s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); - // Convert the hue and v-range back into RGB. - helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2); - } - } else { - auto tadsChannelsIn = array->allTensorsAlongDimension({0}); - auto tadsChannelsOut = output->allTensorsAlongDimension({0}); + const T* x = input->bufferAsT(); + T* z = output->bufferAsT(); - auto bufferR = reinterpret_cast(tadsChannelsIn->at(0)->buffer()); - auto bufferG = reinterpret_cast(tadsChannelsIn->at(1)->buffer()); - auto bufferB = reinterpret_cast(tadsChannelsIn->at(2)->buffer()); + if(dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') { - auto outputR = reinterpret_cast(tadsChannelsOut->at(0)->buffer()); - auto outputG = reinterpret_cast(tadsChannelsOut->at(1)->buffer()); - auto outputB = reinterpret_cast(tadsChannelsOut->at(2)->buffer()); + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (Nd4jLong i = 0; i < input->lengthOf(); i += 3) { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int e = 0; e < tuples; e++) { - auto _ri = bufferR + e; - auto _gi = bufferG + e; - auto _bi = bufferB + e; + T h, s, v; - auto _ro = outputR + e; - auto _go = outputG + e; - auto _bo = outputB + e; + rgbToHsv(x[i], x[i+1], x[i+2], h, s, v); - T h, s, v; - // Convert the RGB color to Hue/V-range. - helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v); - s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); - // Convert the hue and v-range back into RGB. 
- helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo); - } + s *= factor; + if(s > 1.f) + s = 1.f; + else if(s < 0.f) + s = 0.f; - delete tadsChannelsIn; - delete tadsChannelsOut; + hsvToRgb(h, s, v, z[i], z[i+1], z[i+2]); } } + else { - void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { - auto xType = array->dataType(); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); - float d = delta->e(0); - if (array->rankOf() == 4) { - auto tadsIn = array->allTensorsAlongDimension({0}); - auto tadsOut = output->allTensorsAlongDimension({0}); - int tSize = tadsIn->size(); + const Nd4jLong numOfTads = packX.numberOfTads(); + const Nd4jLong xDimCstride = input->stridesOf()[dimC]; + const Nd4jLong zDimCstride = output->stridesOf()[dimC]; - // FIXME: template selector should be moved out of loop - PRAGMA_OMP_PARALLEL_FOR - for (int e = 0; e < tSize; e++) { - BUILD_SINGLE_SELECTOR(xType, adjust_saturation_single_, (context, tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES); - } - + PRAGMA_OMP_PARALLEL_FOR_SIMD + for(Nd4jLong i = 0; i < numOfTads; ++i) { + + const T* xTad = x + packX.platformOffsets()[i]; + T* zTad = z + packZ.platformOffsets()[i]; + + T h, s, v; + + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + + s *= factor; + if(s > 1.f) + s = 1.f; + else if(s < 0.f) + s = 0.f; + + hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); - delete tadsIn; - delete tadsOut; - } - else { - BUILD_SINGLE_SELECTOR(xType, adjust_saturation_single_, (context, array, output, d, isNHWC);, FLOAT_TYPES); } } +} - BUILD_SINGLE_TEMPLATE(template void adjust_saturation_single_, (nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC), FLOAT_TYPES); + +void adjustSaturation(nd4j::LaunchContext* context, const NDArray *input, const NDArray* factorScalarArr, NDArray *output, const int dimC) { + + BUILD_SINGLE_SELECTOR(input->dataType(), adjustSaturation_, (input, factorScalarArr, output, dimC), LIBND4J_TYPES); +} + +/* +template +static void adjust_saturation_single_(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { + // we're 100% sure it's 3 + const int numChannels = 3; + int tuples = array->lengthOf() / numChannels; + auto bIn = reinterpret_cast(array->buffer()); + auto bOut = reinterpret_cast(output->buffer()); + static const int kChannelRange = 6; + + if (isNHWC) { + // for NHWC our rgb values are stored one by one + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (int e = 0; e < tuples; e++) { + auto i = bIn + e * numChannels; + auto o = bOut + e * numChannels; + + T h, s, v; + // Convert the RGB color to Hue/V-range. + helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v); + s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + // Convert the hue and v-range back into RGB. 
+ helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2); + } + } else { + auto tadsChannelsIn = array->allTensorsAlongDimension({0}); + auto tadsChannelsOut = output->allTensorsAlongDimension({0}); + + auto bufferR = reinterpret_cast(tadsChannelsIn->at(0)->buffer()); + auto bufferG = reinterpret_cast(tadsChannelsIn->at(1)->buffer()); + auto bufferB = reinterpret_cast(tadsChannelsIn->at(2)->buffer()); + + auto outputR = reinterpret_cast(tadsChannelsOut->at(0)->buffer()); + auto outputG = reinterpret_cast(tadsChannelsOut->at(1)->buffer()); + auto outputB = reinterpret_cast(tadsChannelsOut->at(2)->buffer()); + + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (int e = 0; e < tuples; e++) { + auto _ri = bufferR + e; + auto _gi = bufferG + e; + auto _bi = bufferB + e; + + auto _ro = outputR + e; + auto _go = outputG + e; + auto _bo = outputB + e; + + T h, s, v; + // Convert the RGB color to Hue/V-range. + helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v); + s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + // Convert the hue and v-range back into RGB. + helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo); + } + + delete tadsChannelsIn; + delete tadsChannelsOut; + } +} + +void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { + auto xType = array->dataType(); + + float d = delta->e(0); + if (array->rankOf() == 4) { + auto tadsIn = array->allTensorsAlongDimension({0}); + auto tadsOut = output->allTensorsAlongDimension({0}); + int tSize = tadsIn->size(); + + // FIXME: template selector should be moved out of loop + PRAGMA_OMP_PARALLEL_FOR + for (int e = 0; e < tSize; e++) { + BUILD_SINGLE_SELECTOR(xType, adjust_saturation_single_, (context, tadsIn->at(e), tadsOut->at(e), d, isNHWC);, FLOAT_TYPES); + } + + + delete tadsIn; + delete tadsOut; + } + else { + BUILD_SINGLE_SELECTOR(xType, adjust_saturation_single_, (context, array, output, d, isNHWC);, FLOAT_TYPES); + } +} + +BUILD_SINGLE_TEMPLATE(template void adjust_saturation_single_, (nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC), FLOAT_TYPES); +*/ } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index 6866aed5f..7b40d0fa7 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -59,13 +59,16 @@ namespace helpers { std::vector dims(reduceShape->lengthOf()); bool fit = true; - - for( int i = 0; fit && (i < dims.size()); i++ ) { - dims[i] = reduceShape->e(i); - for (int e = 0; fit && (e < input->rankOf()); ++e) - if (input->sizeAt(e) % dims[i]) { - fit = false; - } + PRAGMA_OMP_PARALLEL_FOR_ARGS(reduction(&& : fit)) + for( int i = 0; i < dims.size(); i++ ) { + if (fit) { + dims[i] = reduceShape->e(i); + for (int e = 0; e < input->rankOf(); ++e) + if (fit) + if (input->sizeAt(e) % dims[i]) { + fit = false; + } + } } // check dims to fit input
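// Note on the dropout hunk above: the parallel loop needs reduction(&& : fit) rather than
// a firstprivate copy, because a private `fit` is discarded at the end of the parallel
// region and a thread that spots a mismatch could never influence the "check dims to fit
// input" test that follows. A minimal standalone illustration of the and-reduction pattern
// (this example compiles and behaves identically with or without OpenMP enabled):
#include <cstdio>

int main() {
    bool fit = true;
    const int sizes[] = {4, 6, 8};
    const int dims[]  = {3, 2, 5};
    // each thread ANDs into its own copy; OpenMP combines all copies into `fit` at the end
    #pragma omp parallel for reduction(&& : fit)
    for (int i = 0; i < 3; ++i)
        for (int e = 0; e < 3; ++e)
            if (sizes[e] % dims[i])
                fit = false;
    std::printf("fit = %d\n", fit);   // prints 0: e.g. 5 divides none of the sizes
    return 0;
}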
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp index 59c18d064..9799e609d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/gru.cpp @@ -35,82 +35,88 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// -void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wru, const NDArray* Wc, - const NDArray* bru, const NDArray* bc, +void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, + const NDArray* b, const NDArray* bc, NDArray* r, NDArray* u, NDArray* c, NDArray* h) { //Inputs: - // x input [bS, nIn], nIn - input size - // hLast previous cell output [bS, nUn], that is at previous time step t-1, nUn - number of units - // Wru RU weights - [nIn+nUn, 2*nUn] - reset and update gates - // Wc C weights - [nIn+nUn, nUn] - cell gate - // bru r and u biases, [2*nUn] - reset and update gates - // bc c biases, [nUn] - cell gate + // x input [bS, iS], iS - input size + // hLast previous cell output [bS, nU], that is at previous time step t-1, nU - number of units + // W RU weights - [iS+nU, 2*nU] - reset and update gates + // Wc C weights - [iS+nU, nU] - cell gate + // b r and u biases, [2*nU] - reset and update gates + // bc c biases, [nU] - cell gate //Outputs: - // r Reset gate output [bS, nUn] - // u Update gate output [bS, nUn] - // c Cell gate output [bS, nUn] - // h current cell output [bS, nUn] + // r Reset gate output [bS, nU] + // u Update gate output [bS, nU] + // c Cell gate output [bS, nU] + // h current cell output [bS, nU] /***************************************************************************************/ /************************ THIS IS NOT OPTIMIZED CODE ***********************************/ /** however it is more math-friendly and convenient for backprop formulas derivation **/ const int bS = x->sizeAt(0); - const int nIn = x->sizeAt(1); - const int nUn = hLast->sizeAt(1); + const int iS = x->sizeAt(1); + const int nU = hLast->sizeAt(1); - NDArray Wr = (*Wru)({0,nIn, 0,0}); // reset gates weights [nIn, 2*nUn] - NDArray Wu = (*Wru)({nIn,nIn+nUn, 0,0}); // updates gates weights [nUn, 2*nUn] + NDArray Wrx = (*W)({0,iS, 0,nU}); // [iS, nU] + NDArray Wux = (*W)({0,iS, nU,2*nU}); // [iS, nU] + NDArray Wrh = (*W)({iS,iS+nU, 0,nU}); // [nU, nU] + NDArray Wuh = (*W)({iS,iS+nU, nU,2*nU}); // [nU, nU] - NDArray Wcr = (*Wc)({0,nIn, 0,0}); // reset cell weights [nIn, nUn] - NDArray Wcu = (*Wc)({nIn,nIn+nUn, 0,0}); // updates cell weights [nUn, nUn] + NDArray Wcx = (*Wc)({0,iS, 0,0}); // reset cell weights [iS, nU] + NDArray Wch = (*Wc)({iS,iS+nU, 0,0}); // updates cell weights [nU, nU] - // gates = sigmoid(x*Wr + hLast*Wu + br + bu) - NDArray gates = mmul(*x, Wr) + mmul(*hLast, Wu) + *bru; // [bS, nIn] * [nIn, 2*nUn] + [bS, nUn] * [nUn, 2*nUn] + [2*nUn] = [bS, 2*nUn] - gates.applyTransform(transform::Sigmoid); + NDArray br = (*b)({0, nU}); // [nU] + NDArray bu = (*b)({nU, 2*nU}); // [nU] + + // × means matrix multiplication + // * means element-wise product or so called Hadamard product // reset gate - r->assign(gates({0,0, 0,nUn})); // [bS, nUn] + r->assign(mmul(*x, Wrx) + mmul(*hLast, Wrh) + br); // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + r->applyTransform(transform::Sigmoid); // update gate - u->assign(gates({0,0, nUn,2*nUn})); // [bS, nUn] + u->assign(mmul(*x, Wux) + mmul(*hLast, Wuh) + bu); // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + u->applyTransform(transform::Sigmoid); - // cell gate c = activation(x*Wcr + (r◦hlast)*Wcu + bc) - c->assign(mmul(*x, Wcr) + mmul(*r * *hLast, Wcu) + *bc); // [bS, nIn] * [nIn, nUn] + [bS, nUn] * [nUn, nUn] + [nUn] = [bS, nUn] + // cell gate c = activation(x × Wcx + (r * hlast) × Wch + bc) + c->assign(mmul(*x, Wcx) + mmul(*r * *hLast, Wch) + *bc); // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] c->applyTransform(transform::Tanh); + NDArray temp = 1.f - *c * *c; + // cell output h->assign(*u * *hLast + (1.f - *u) * *c);
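// In equation form, the forward pass assembled above (sigma = sigmoid, o = Hadamard
// product, x = matrix multiplication):
//
//   r = sigma(x × Wrx + hLast × Wrh + br)
//   u = sigma(x × Wux + hLast × Wuh + bu)
//   c = tanh(x × Wcx + (r o hLast) × Wch + bc)
//   h = u o hLast + (1 - u) o c
//
// The stray `temp = 1 - c*c` matches tanh'(.) expressed through c itself and is presumably
// kept for the backprop derivation; it does not feed the forward output.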
- - /***************************************************************************************/ - /********************** THIS IS MORE OPTIMIZED CODE (except concat) **********************/ + /*************** THIS IS MORE OPTIMIZED CODE (should think about concat) ***************/ /***************************************************************************************/ /* - //Concat inputs: x + hLast : [bs, nIn + nUn] - NDArray xhConcat(x->ordering(), {bS, nIn + nUn}, x->dataType(), context); // concat([bs, nIn], [bs, nUn]) -> [bs, nIn + nUn] + //Concat inputs: x + hLast : [bs, iS + nU] + NDArray xhConcat(x->ordering(), {bS, iS + nU}, x->dataType(), context); // concat([bs, iS], [bs, nU]) -> [bs, iS + nU] helpers::concat(context, {const_cast(x), const_cast(hLast)}, xhConcat, {1}); - //mmul for reset and update gates: (x * weight_ux + hLast * weight_xr + b_u) - auto m = mmul(xhConcat, *Wru) + *bru ; // [bs, nIn+nUn] * [nIn+nUn, 2*nUn] = [bs, 2*nUn] + //mmul for reset and update gates: (x × weight_ux + hLast × weight_xr + b_u) + auto m = mmul(xhConcat, *W) + *b ; // [bs, iS+nU] * [iS+nU, 2*nU] = [bs, 2*nU] // m += *bru; - sigmoidInplace(m); //sigmoid(rz) and sigmoid(uz) + m.applyTransform(transform::Sigmoid); //sigmoid(rz) and sigmoid(uz) - r->assign(m({0,0, 0, nUn})); - u->assign(m({0,0, nUn, 2*nUn})); + r->assign(m({0,0, 0, nU})); + u->assign(m({0,0, nU, 2*nU})); // hLast = hLast * r - xhConcat({0,0, nIn, nIn+nUn}) *= *r; + xhConcat({0,0, iS, iS+nU}) *= *r; - //c = tanh(x * weight_cx + (hLast .* r) * weight_cr + b_c) + //c = tanh(x × weight_cx + (hLast * r) × weight_cr + b_c) MmulHelper::mmul(&xhConcat, Wc, c, 1.0, 0.0); //c = 1.0 * xhConcat * Wc + 0.0 * c *c += *bc; - tanhInplace(*c); + c->applyTransform(transform::Tanh); //Output: h = (1-u).*c + u .* hPrev //auto hResult = (*u) * (*hLast) + (1.0f - *u) * (*c); const_cast(h)->assign(&hResult); @@ -122,19 +128,19 @@ void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLa } ////////////////////////////////////////////////////////////////////////// -void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h) { +void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h) { // x input [time, bS, iS] - // h0 initial cell output (at time step = 0) [bS, nUn] - // Wx input-to-hidden weights, [iS, 3*nUn] - // Wh hidden-to-hidden weights, [nUn, 3*nUn] - // b biases, [3*nUn] + // hLast initial cell output (at time step = 0) [bS, nU] + // Wx input-to-hidden weights, [iS, 3*nU] + // Wh hidden-to-hidden weights, [nU, 3*nU] + // b biases, [3*nU] - // h is cell outputs at each time step [time, bS, nUn] + // h is cell outputs at each time step [time, bS, nU] const int time = x->sizeAt(0); - NDArray ht_1(*h0); + NDArray ht_1(*hLast); // loop through time steps for (int t = 0; t < time; ++t) { @@ -142,111 +148,214 @@ void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* auto xt = (*x)({t,t+1, 0,0, 0,0}); auto ht = (*h)({t,t+1, 0,0, 0,0}); - //helpers::gruCell(&xt, &ht_1, Wx, Wh, b, &ht); - //ht_1.assign(ht); + // helpers::gruCell(&xt, &ht_1, Wx, Wh, b, &ht); + // ht_1.assign(ht); } }
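// The per-step cell invocation inside gruTimeLoop is commented out at this revision (the
// gruCell signature has grown to the W/Wc/b/bc form). For reference, the state-threading
// pattern the loop is meant to implement, as a generic standalone sketch with illustrative
// names:
#include <functional>
#include <vector>

using Vec = std::vector<float>;

// cell maps (x_t, h_{t-1}) -> h_t; the loop carries h across steps and records each output.
static std::vector<Vec> timeLoop(const std::vector<Vec>& xs, Vec h,
                                 const std::function<Vec(const Vec&, const Vec&)>& cell) {
    std::vector<Vec> outputs;
    for (const Vec& x : xs) {
        h = cell(x, h);          // h_t becomes h_{t-1} of the next step
        outputs.push_back(h);
    }
    return outputs;
}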
NDArray* Wh, const NDArray* b, const NDArray* dLdh, const NDArray* dLdWx0,
-              const NDArray* dLdWh0, const NDArray* dLdb0, NDArray* dLdx, NDArray* dLdh0, NDArray* dLdWx, NDArray* dLdWh, NDArray* dLdb) {
+void gruCellBP(nd4j::LaunchContext* context,
+               const NDArray* x, const NDArray* hLast,
+               const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc,
+               const NDArray* dLdr, const NDArray* dLdu, const NDArray* dLdc, const NDArray* dLdh,
+               NDArray* dLdx, NDArray* dLdhLast,
+               NDArray* dLdW, NDArray* dLdWc,
+               NDArray* dLdb, NDArray* dLdbc) {

-   // x      input [bS, iS]
-   // h0     previous cell output [bS, nUn], that is at previous time step t-1
-   // Wx     input-to-hidden weights, [iS, 3*nUn]
-   // Wh     hidden-to-hidden weights, [nUn, 3*nUn]
-   // b      biases, [3*nUn]
-   // dLdh   gradient wrt output, [bS,nUn], that is epsilon_next
-   // dLdWx0 gradient wrt Wx at previous time step, [iS, 3*nUn]
-   // dLdWh0 gradient wrt Wh at previous time step, [nUn, 3*nUn]
-   // dLdb0  gradient wrt b at previous time step, [3*nUn]
+   //Inputs:
+   // x        input [bS, iS]
+   // hLast    previous cell output [bS, nU], that is at previous time step t-1
+   // W        weights - [iS+nU, 2*nU] - reset and update gates
+   // Wc       C weights - [iS+nU, nU] - cell gate
+   // b        r and u biases, [2*nU] - reset and update gates
+   // bc       c biases, [nU] - cell gate
+   // dLdr     gradient wrt reset gate, [bS, nU]
+   // dLdu     gradient wrt update gate, [bS, nU]
+   // dLdc     gradient wrt cell state, [bS, nU]
+   // dLdh     gradient wrt current cell output, [bS, nU]

-   // dLdx   gradient wrt x, [bS, iS], that is epsilon
-   // dLdh0  gradient wrt h0, [bS, nUn]
-   // dLdWx  gradient wrt Wx, [iS, 3*nUn]
-   // dLdWh  gradient wrt Wh, [nUn, 3*nUn]
-   // dLdb   gradient wrt b at previous time step, [3*nUn]
+   //Outputs:
+   // dLdx     gradient wrt x, [bS, iS]
+   // dLdhLast gradient wrt hLast, [bS, nU]
+   // dLdW     gradient wrt W, [iS+nU, 2*nU]
+   // dLdWc    gradient wrt Wc, [iS+nU, nU]
+   // dLdb     gradient wrt b, [2*nU]
+   // dLdbc    gradient wrt bc, [nU]

-   // h is current cell output [bS, nUn], that is at current time step t
+   // * means element-wise product or the so-called Hadamard product
+   // × means matrix multiplication
+
+   /************************************************************************************************/
+   /******************************* THIS IS NOT OPTIMIZED CODE ************************************/
+   /*** the aim is to have math-readable code in order to keep track of the backprop derivation ***/
+
+   const int bS = x->sizeAt(0);
+   const int iS = x->sizeAt(1);
+   const int nU = hLast->sizeAt(1);
+
+   NDArray xT     = x->transpose();     // [iS, bS]
+   NDArray hLastT = hLast->transpose(); // [nU, bS]
+
+   NDArray Wrx = (*W)({0,iS,     0,nU});     // [iS, nU]
+   NDArray Wux = (*W)({0,iS,     nU,2*nU});  // [iS, nU]
+   NDArray Wrh = (*W)({iS,iS+nU, 0,nU});     // [nU, nU]
+   NDArray Wuh = (*W)({iS,iS+nU, nU,2*nU});  // [nU, nU]
+
+   NDArray Wcx = (*Wc)({0,iS,     0,0});     // input-to-cell weights  [iS, nU]
+   NDArray Wch = (*Wc)({iS,iS+nU, 0,0});     // hidden-to-cell weights [nU, nU]
+
+   NDArray br = (*b)({0,  nU});              // [nU]
+   NDArray bu = (*b)({nU, 2*nU});            // [nU]
+
+   NDArray WrxT = Wrx.transpose();           // [nU, iS]
+   NDArray WuxT = Wux.transpose();           // [nU, iS]
+   NDArray WrhT = Wrh.transpose();           // [nU, nU]
+   NDArray WuhT = Wuh.transpose();           // [nU, nU]
+
+   NDArray WcxT = Wcx.transpose();           // [nU, iS]
+   NDArray WchT = Wch.transpose();           // [nU, nU]
+
+   NDArray dLdWrx = (*dLdW)({0,iS,     0,nU});     // [iS, nU]
+   NDArray dLdWux = (*dLdW)({0,iS,     nU,2*nU});  // [iS, nU]
+   NDArray dLdWrh = (*dLdW)({iS,iS+nU, 0,nU});     // [nU, nU]
+   NDArray dLdWuh = (*dLdW)({iS,iS+nU,
nU,2*nU}); // [nU, nU] + + NDArray dLdWcx = (*dLdWc)({0,iS, 0,0}); // [iS, nU] + NDArray dLdWch = (*dLdWc)({iS,iS+nU, 0,0}); // [nU, nU] + + NDArray dLdbr = (*dLdb)({0, nU}); // [nU] + NDArray dLdbu = (*dLdb)({nU, 2*nU}); // [nU] - const int nUn = h0->sizeAt(1); // ***** feed forward step ***** // - // gates = sigmoid(x*Wx + h0*Wh + b) - auto gates = sigmoid(mmul(*x, (*Wx)({0,0, 0,2*nUn})) + mmul(*h0, (*Wh)({0,0, 0,2*nUn})) + (*b)({0,2*nUn})); // [bS, 2*nUn] + [bS, 2*nUn] + [1, 2*nUn] = [bS, 2*nUn] + // reset gate - auto r = gates({0,0, 0, nUn}); // [bS, nUn] + NDArray r = mmul(*x, Wrx) + mmul(*hLast, Wrh) + br; // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + r.applyTransform(transform::Sigmoid); + // update gate - auto u = gates({0,0, nUn, 2*nUn}); // [bS, nUn] - // ◦ means element-wise product or so called Hadamard product - // n = tanh(x*Wx + (r◦h0)*Wh + b) - auto n = tanh(mmul(*x, (*Wx)({0,0, 2*nUn,3*nUn})) + mmul((*h0)*r, (*Wh)({0,0, 2*nUn,3*nUn})) + (*b)({2*nUn,3*nUn})); // [bS, nUn] + NDArray u = mmul(*x, Wux) + mmul(*hLast, Wuh) + bu; // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + u.applyTransform(transform::Sigmoid); + + // cell gate c = activation(x×Wcx + (r*hlast)×Wcu + bc) + NDArray c = mmul(*x, Wcx) + mmul(r * *hLast, Wch) + *bc; // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + c.applyTransform(transform::Tanh); + + // h = (1 - u) * c + u * hPrev + // ***** back prop step ***** // - auto Wxr = (*Wx)({0,0, 0, nUn}); - auto Wxu = (*Wx)({0,0, nUn, 2*nUn}); - auto Wxn = (*Wx)({0,0, 2*nUn,3*nUn}); - auto Whr = (*Wh)({0,0, 0, nUn}); - auto Whu = (*Wh)({0,0, nUn, 2*nUn}); - auto Whn = (*Wh)({0,0, 2*nUn,3*nUn}); - auto WxrT = Wxr.transpose(); - auto WxuT = Wxu.transpose(); - auto WxnT = Wxn.transpose(); - auto WhrT = Whr.transpose(); - auto WhuT = Whu.transpose(); - auto WhnT = Whn.transpose(); - auto xT = x->transpose(); - auto h0T = h0->transpose(); - auto dLdWxr = (*dLdWx)({0,0, 0, nUn}); - auto dLdWxu = (*dLdWx)({0,0, nUn, 2*nUn}); - auto dLdWxn = (*dLdWx)({0,0, 2*nUn,3*nUn}); + // notations: + // Zr = x × Wrx + hLast × Wrh + br + // Zu = x × Wux + hLast × Wuh + bu + // Sr = sigmoid(Zr) + // Su = sigmoid(Zu) + // Zc = x × Wcx + (r * hlast) × Wch + bc - auto dLdWhr = (*dLdWh)({0,0, 0, nUn}); - auto dLdWhu = (*dLdWh)({0,0, nUn, 2*nUn}); - auto dLdWhn = (*dLdWh)({0,0, 2*nUn,3*nUn}); - auto dLdbr = (*dLdb)({0, nUn}); - auto dLdbu = (*dLdb)({nUn, 2*nUn}); - auto dLdbn = (*dLdb)({2*nUn,3*nUn}); + // dLdx = dLdh * dhdx = dLdh * (dhdu * dudx + dhdc * dcdx) = (dLdh * dhdu) * dudx + (dLdh * dhdc) * dcdx = dLdu * dudx + dLdc * dcdx + // = dLdx_u + dLdx_c + // dLdx_u = dLdu * dudx = dLdu * dudZu * dZudx = |dZudx = ... × WuxT| = (dLdu * dudZu) × WuxT + // dLdx_c = dLdc * dcdx = dLdc * dcdZc * (dZcdx + dZcdr * drdx) = dLdc * dcdZc * dZcdx + dLdc * dcdZc * dZcdr * drdx = dLdx_c0 + dLdx_c1 + // dLdx_c0 = dLdc * dcdZc * dZcdx = |dZcdx = ... × WcxT| = (dLdc * dcdZc) × WcxT + // dZcdr = (... * hLast) × WchT + // dLdc * dcdZc * dZcdr = dLdr = (dLdc * dcdZc * hLast) × WchT + // drdx = drdZr * dZrdx + // dZrdx = ... 
× WrxT
+   // dLdx_c1 = dLdc * dcdZc * dZcdr * drdx = dLdr * drdx = (dLdr * drdZr) × WrxT
+   // finally dLdx = dLdx_u + dLdx_c0 + dLdx_c1 = (dLdu * dudZu) × WuxT + (dLdc * dcdZc) × WcxT + (dLdr * drdZr) × WrxT

-   auto dhdu   = *h0 - n;              // [bS, nUn]
-   auto dhdn   = 1.f - u;              // [bS, nUn]
-   auto dSigdu = u * (1.f - u);        // [bS, nUn]
-   auto dSigdr = r * (1.f - r);        // [bS, nUn]
-   auto dActdn = 1.f - n * n;          // [bS, nUn]
-   auto dndr   = mmul(dActdn * (*h0), WhnT);
-   auto drdh0  = mmul(dSigdr, WhrT);
-   auto dLdn   = (*dLdh) * dhdn;
-   auto dLdu   = (*dLdh) * dhdu;
-   auto dLdr   = dLdn * dndr;
+   // dLdhLast    = dLdh * (dhdhLast + dhdu * dudhLast + dhdc * dcdhLast) = dLdh * dhdhLast + dLdu * dudhLast + dLdc * dcdhLast
+   //             = dLdhLast_h + dLdhLast_u + dLdhLast_c
+   // dLdhLast_h  = dLdh * dhdhLast = dLdh * u
+   // dLdhLast_u  = dLdu * dudhLast = |dudhLast = dudZu * dZudhLast , dZudhLast = ... × WuhT| = (dLdu * dudZu) × WuhT
+   // dLdhLast_c  = dLdc * dcdhLast = dLdc * (dcdZc * dZcdhLast + dcdZc * dZcdr * drdhLast) =
+   //             = dLdc * dcdZc * dZcdhLast + dLdc * dcdZc * dZcdr * drdhLast =
+   //             = dLdc * dcdZc * dZcdhLast + dLdr * drdhLast = dLdhLast_c0 + dLdhLast_c1
+   // dLdhLast_c0 = dLdc * dcdZc * dZcdhLast = |dZcdhLast = (... * r) × WchT| = (dLdc * dcdZc * r) × WchT
+   // dLdhLast_c1 = dLdr * drdhLast = |drdhLast = drdZr * dZrdhLast, dZrdhLast = ... × WrhT| = (dLdr * drdZr) × WrhT
+   // finally dLdhLast = dLdhLast_h + dLdhLast_u + dLdhLast_c0 + dLdhLast_c1 =
+   //                  = dLdh * u + (dLdu * dudZu) × WuhT + (dLdc * dcdZc * r) × WchT + (dLdr * drdZr) × WrhT

-   dLdx->assign( mmul(dLdu * dSigdu, WxuT) + mmul(dLdr * dSigdr, WxrT) + mmul(dLdn * dActdn, WxnT) );   // [bS,iS]
-   dLdh0->assign( mmul(dLdu * dSigdu, WhuT) + mmul(dLdn * dActdn * (r + drdh0), WhnT) + (*dLdh)*u );    // [bS,nUn]
-   dLdWxr.assign( mmul(xT, dSigdr * dLdr) );     // [iS,nUn]
-   dLdWhr.assign( mmul(h0T, dSigdr * dLdr) );    // [nUn,nUn]
+   // dLdWrx = dLdh * dhdWrx = (dLdh * dhdc) * dcdWrx = dLdc * dcdZc * dZcdWrx = dLdc * dcdZc * dZcdr * drdWrx =
+   //        = dLdc * dcdZc * dZcdr * drdZr * dZrdWrx = dLdr * drdZr * dZrdWrx
+   // dZrdWrx = xT × ...
+   // finally dLdWrx = xT × (dLdr * drdZr)

-   dLdWxu.assign( mmul(xT, dSigdu * dLdu) );     // [iS,nUn]
-   dLdWhu.assign( mmul(h0T, dSigdu * dLdu) );    // [nUn,nUn]
-   dLdWxn.assign( mmul(xT, dActdn * dLdn) );     // [iS,nUn]
-   dLdWhn.assign( mmul((r*(*h0)).transpose(), dActdn * dLdn) );    // [nUn,nUn]
+   // dLdWrh = dLdh * dhdWrh = (dLdh * dhdc) * dcdWrh = dLdc * dcdZc * dZcdWrh = dLdc * dcdZc * dZcdr * drdWrh =
+   //        = dLdc * dcdZc * dZcdr * drdZr * dZrdWrh = dLdr * drdZr * dZrdWrh
+   // dZrdWrh = hLastT × ...
+   // finally dLdWrh = hLastT × (dLdr * drdZr)

-   dLdbr.assign( (dSigdr * dLdr).reduceAlongDims(reduce::Sum, {0}));    // [nUn]
-   dLdbu.assign( (dSigdu * dLdu).reduceAlongDims(reduce::Sum, {0}));    // [nUn]
-   dLdbn.assign( (dActdn * dLdn).reduceAlongDims(reduce::Sum, {0}));    // [nUn]

-   if(dLdWx0 != nullptr)
-       *dLdWx += *dLdWx0;
+   // dLdWux = dLdh * dhdWux = (dLdh * dhdu) * dudWux = dLdu * dudZu * dZudWux
+   // dZudWux = xT × ...
+   // finally dLdWux = xT × (dLdu * dudZu)

-   if(dLdWh0 != nullptr)
-       *dLdWh += *dLdWh0;
-   if(dLdb0 != nullptr)
-       *dLdb += *dLdb0;
+   // dLdWuh = dLdh * dhdWuh = (dLdh * dhdu) * dudWuh = dLdh * dhdu * dudZu * dZudWuh = dLdu * dudZu * dZudWuh
+   // dZudWuh = hLastT × ...
+   // finally dLdWuh = hLastT × (dLdu * dudZu)
+
+   // dLdWcx = dLdh * dhdWcx = dLdh * dhdc * dcdWcx = (dLdh * dhdc) * dcdZc * dZcdWcx = dLdc * dcdZc * dZcdWcx
+   // dZcdWcx = xT × ...
+ // finally dLdWcx = xT × (dLdc * dcdZc) + + + // dLdWch = dLdh * dhdWch = dLdh * dhdc * dcdWch = (dLdh * dhdc) * dcdZc * dZcdWch = dLdc * dcdZc * dZcdWch + // dZcdWch = (r*hLast)^T × ... + // finally dLdWch = (r*hLast)^T × (dLdc * dcdZc) + + + // dLdbr = dLdh * dhdbr = (dLdh * dhdc) * dcdbr = dLdc * dcdbr = dLdc * dcdZc * dZcdbr = dLdc * dcdZc * dZcdr * drdbr = + // = dLdr * drdZr * dZrdbr + // dZrdbr = 1 + // finally dLdbr = dLdr * drdZr + + + // dLdbu = dLdh * dhdbu = (dLdh * dhdu) * dudbu = dLdu * dudZu * dZudbu + // dZudbu = 1 + // finally dLdbu = dLdu * dudZu + + + // dLdbc = dLdh * dhdbc = (dLdh * dhdc) * dcdbc = dLdc * dcdZc * dZcdbc + // dZcdbc = 1 + // finally dLdbc = dLdc * dcdZc + + NDArray dhdc = 1.f - u; // [bS, nU] + NDArray dhdu = *hLast - c; // [bS, nU] + NDArray dudZu = u * dhdc; // [bS, nU] + NDArray drdZr = r * (1.f - r); // [bS, nU] + NDArray dcdZc = 1.f - c * c; // [bS, nU] + NDArray dLdZc = *dLdc * dcdZc; // [bS, nU] + NDArray dLdZu = *dLdu * dudZu; // [bS, nU] + NDArray dLdZr = *dLdr * drdZr; // [bS, nU] + + // NDArray dLdc = *dLdh * dhdc; // [bS, nU] + // NDArray dLdu = *dLdh * dhdu; // [bS, nU] + // NDArray dLdr = mmul(dLdc * dcdZc * *hLast, WchT); // [bS, nU] + + dLdx->assign(mmul(dLdZu, WuxT) + mmul(dLdZc, WcxT) + mmul(dLdZr, WrxT)); // [bS, iS] + + dLdhLast->assign(*dLdh * u + mmul(dLdZu, WuhT) + mmul(dLdZc * r, WchT) + mmul(dLdZr, WrhT)); // [bS, nU] + + dLdWrx.assign(mmul(xT, dLdZr)); // [iS, bS] × [bS, nU] = [iS, nU] + dLdWrh.assign(mmul(hLastT, dLdZr)); // [nU, bS] × [bS, nU] = [nU, nU] + dLdWux.assign(mmul(xT, dLdZu)); // [iS, bS] × [bS, nU] = [iS, nU] + dLdWuh.assign(mmul(hLastT, dLdZu)); // [nU, bS] × [bS, nU] = [nU, nU] + + dLdWcx.assign(mmul(xT, dLdZc)); // [iS, bS] × [bS, nU] = [iS, nU] + dLdWch.assign(mmul((r * *hLast).transpose(), dLdZc)); // [nU, bS] × [bS, nU] = [nU, nU] + + dLdbr.assign(dLdZr.reduceAlongDims(reduce::Sum, {0})); // [nU] + dLdbu.assign(dLdZu.reduceAlongDims(reduce::Sum, {0})); // [nU] + + dLdbc->assign(dLdZc.reduceAlongDims(reduce::Sum, {0})); // [nU] } // ////////////////////////////////////////////////////////////////////////// @@ -255,34 +364,34 @@ void gruCellBP(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h // void gruTimeLoopBP(const std::vector*>& inArrs, const std::vector*>& outArrs) { // NDArray* x = inArrs[0]; // input [time, bS, iS] -// NDArray* hi = inArrs[1]; // previous/initial cell output [bS, nUn], that is at previous time step t-1 -// NDArray* Wx = inArrs[2]; // input-to-hidden weights, [iS, 3*nUn] -// NDArray* Wh = inArrs[3]; // hidden-to-hidden weights, [nUn, 3*nUn] -// NDArray* b = inArrs[4]; // biases, [3*nUn] -// NDArray* dLdh = inArrs[5]; // gradient wrt output, [time, bS, nUn], that is epsilon_next +// NDArray* hi = inArrs[1]; // previous/initial cell output [bS, nU], that is at previous time step t-1 +// NDArray* Wx = inArrs[2]; // input-to-hidden weights, [iS, 3*nU] +// NDArray* Wh = inArrs[3]; // hidden-to-hidden weights, [nU, 3*nU] +// NDArray* b = inArrs[4]; // biases, [3*nU] +// NDArray* dLdh = inArrs[5]; // gradient wrt output, [time, bS, nU], that is epsilon_next // NDArray* dLdx = outArrs[0]; // gradient wrt x, [time, bS, iS], that is epsilon -// NDArray* dLdhi = outArrs[1]; // gradient wrt hi, [bS, nUn] -// NDArray* dLdWx = outArrs[2]; // gradient wrt Wx, [iS, 3*nUn] -// NDArray* dLdWh = outArrs[3]; // gradient wrt Wh, [nUn, 3*nUn] -// NDArray* dLdb = outArrs[4]; // gradient wrt b, [3*nUn] +// NDArray* dLdhi = outArrs[1]; // gradient wrt hi, [bS, nU] +// NDArray* dLdWx = 
outArrs[2]; // gradient wrt Wx, [iS, 3*nU] +// NDArray* dLdWh = outArrs[3]; // gradient wrt Wh, [nU, 3*nU] +// NDArray* dLdb = outArrs[4]; // gradient wrt b, [3*nU] // const Nd4jLong time = x->sizeAt(0); // const Nd4jLong bS = x->sizeAt(1); // const Nd4jLong iS = x->sizeAt(2); -// const Nd4jLong nUn = hi->sizeAt(1); +// const Nd4jLong nU = hi->sizeAt(1); -// NDArray h(hi->ordering(), {time, bS, nUn}); // feed forward output +// NDArray h(hi->ordering(), {time, bS, nU}); // feed forward output // // first step, time = 0, feed forward // NDArray x0 = (*x)({{0,1}, {}, {}}); -// NDArray h0 = h({{0,1}, {}, {}}); -// helpers::gruCell({&x0, hi, Wx, Wh, b}, &h0); +// NDArray hLast = h({{0,1}, {}, {}}); +// helpers::gruCell({&x0, hi, Wx, Wh, b}, &hLast); // // first step, time = 0, back prop // NDArray dLdx0 = (*dLdx)({{0,1}, {}, {}}); -// NDArray dLdh0 = (*dLdh)({{0,1}, {}, {}}); -// helpers::gruCellBP({&x0, hi, Wx, Wh, b, &dLdh0, nullptr, nullptr, nullptr}, {&dLdx0, dLdhi, dLdWx, dLdWh, dLdb}); +// NDArray dLdhLast = (*dLdh)({{0,1}, {}, {}}); +// helpers::gruCellBP({&x0, hi, Wx, Wh, b, &dLdhLast, nullptr, nullptr, nullptr}, {&dLdx0, dLdhi, dLdWx, dLdWh, dLdb}); // // loop through the rest time steps // for (Nd4jLong t = time-1; t > 0; --t) { @@ -310,4 +419,3 @@ void gruCellBP(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h } } } - diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp index 28f301e5b..f90974a9f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_suppression.cpp @@ -20,6 +20,8 @@ #include //#include +#include +#include namespace nd4j { namespace ops { @@ -28,9 +30,8 @@ namespace helpers { template static void nonMaxSuppressionV2_(NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) { std::vector indices(scales->lengthOf()); + std::iota(indices.begin(), indices.end(), 0); - for (size_t i = 0; i < indices.size(); ++i) - indices[i] = i; std::sort(indices.begin(), indices.end(), [scales](int i, int j) {return scales->e(i) > scales->e(j);}); // std::vector selected(output->lengthOf()); @@ -62,13 +63,15 @@ namespace helpers { }; // int numSelected = 0; int numBoxes = boxes->sizeAt(0); + int numSelected = 0; - for (int i = 0, numSelected = 0; i < numBoxes && numSelected < output->lengthOf(); ++i) { - bool shouldSelect = true; + for (int i = 0; i < numBoxes; ++i) { + bool shouldSelect = numSelected < output->lengthOf(); + PRAGMA_OMP_PARALLEL_FOR //_ARGS(firstprivate(numSelected)) for (int j = numSelected - 1; j >= 0; --j) { + if (shouldSelect) if (needToSuppressWithThreshold(*boxes, indices[i], indices[selectedIndices[j]], T(threshold))) { shouldSelect = false; - break; } } if (shouldSelect) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp index 6f3cb6aba..03f61d453 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sequence_mask.cpp @@ -24,20 +24,20 @@ namespace nd4j { namespace ops { namespace helpers { - template + template static void sequenceMask_(NDArray* input, NDArray* output, int maxIndex) { PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) for (Nd4jLong i = 0; i < maxIndex; i++) for(Nd4jLong k = 0; k < input->lengthOf(); k++) - if (i < input->e(k)) - output->p(k * maxIndex + i, T(1.0f)); + if (i < input->t(k)) + 
output->t(k * maxIndex + i) = B(true); //, T(1.0f)); } void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { - BUILD_SINGLE_SELECTOR(input->dataType(), sequenceMask_, (input, output, maxIndex), LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(input->dataType(), output->dataType(), sequenceMask_, (input, output, maxIndex), INTEGER_TYPES, BOOL_TYPES); } - BUILD_SINGLE_TEMPLATE(template void sequenceMask_, (NDArray* input, NDArray* output, int maxIndex), LIBND4J_TYPES); + BUILD_DOUBLE_TEMPLATE(template void sequenceMask_, (NDArray* input, NDArray* output, int maxIndex), INTEGER_TYPES, BOOL_TYPES); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 5a132a5c6..2ea82a145 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -27,6 +27,7 @@ #include #include #include +#include namespace nd4j { namespace ops { @@ -81,7 +82,7 @@ static void trace_(const NDArray& input, NDArray& output) { ////////////////////////////////////////////////////////////////////////// template -void randomShuffle_(NDArray& input, NDArray& output, nd4j::random::RandomBuffer& rng, const bool isInplace) { +void randomShuffle_(NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace) { // check edge cases first int temp; @@ -95,16 +96,16 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::random::RandomBuffer& // apply Fisher-Yates shuffle if(isInplace) { - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + //PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) for(int i = firstDim-1; i > 0; --i) { - int r = rng.nextInt(0, i); + int r = rng.relativeInt(i) % i; if(i == r) continue; - T _e0 = input.e(i); - T _e1 = input.e(r); + T t0 = input.t(i); + T t1 = input.t(r); //math::nd4j_swap(input(i), input(r)); - input.p(i, _e1); - input.p(r, _e0); + input.t(i) = t1; + input.t(r) = t0; } } else { @@ -113,12 +114,12 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::random::RandomBuffer& output.p(Nd4jLong(0), input.e(0)); PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) for(int i = firstDim-1; i > 0; --i) { - int r = rng.nextInt(0, i); - output.p(i, input.e(indices[r])); + int r = rng.relativeInt(i) % i; + output.t(i) = input.t(indices[r]); if(i == r) continue; - output.p(r, input.e(indices[i])); + output.t(r) = input.t(indices[i]); math::nd4j_swap(indices[i], indices[r]); } rng.rewindH(firstDim-1); @@ -132,9 +133,10 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::random::RandomBuffer& // apply Fisher-Yates shuffle if(isInplace) { - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->elementwiseThreshold()) - for(int i = firstDim-1; i > 0; --i) { - int r = rng.nextInt(0, i); + //PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->elementwiseThreshold()) + for(int i = firstDim - 1; i > 0; --i) { + int r = rng.relativeInt(i) % i; + if(i == r) continue; subArrsListIn->at(i)->swapUnsafe(*subArrsListIn->at(r)); @@ -146,9 +148,9 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::random::RandomBuffer& std::vector indices(firstDim); std::iota(indices.begin(), indices.end(), 0); bool isZeroShuffled = false; - PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) 
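
For comparison with the shuffle loops in randomShuffle_ here, the following is a textbook Fisher-Yates pass over a flat vector, with a hypothetical counter-based generator standing in for rng.relativeInt (illustrative sketch only, not library code). Note that the classic algorithm draws the swap index uniformly from [0, i] inclusive; restricting it to [0, i-1], as a plain `% i` does, yields Sattolo's cyclic variant rather than a uniform shuffle.

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Hypothetical stand-in for nd4j::graph::RandomGenerator::relativeInt (a simple LCG).
    static uint32_t nextUint(uint64_t& state) {
        state = state * 6364136223846793005ULL + 1442695040888963407ULL;
        return static_cast<uint32_t>(state >> 33);
    }

    void fisherYates(std::vector<float>& a, uint64_t seed) {
        if (a.size() < 2)
            return;
        uint64_t state = seed;
        for (size_t i = a.size() - 1; i > 0; --i) {
            const size_t r = nextUint(state) % (i + 1);   // uniform in [0, i]
            if (r != i)
                std::swap(a[i], a[r]);
        }
    }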
- for(int i = firstDim-1; i > 0; --i) { - int r = rng.nextInt(0, i); + //PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + for(int i = firstDim - 1; i > 0; --i) { + int r = rng.relativeInt(i) % i; subArrsListOut->at(i)->assign(subArrsListIn->at(indices[r])); if(r == 0) isZeroShuffled = true; @@ -167,11 +169,11 @@ void randomShuffle_(NDArray& input, NDArray& output, nd4j::random::RandomBuffer& } - void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::random::RandomBuffer& rng, const bool isInplace) { + void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace) { BUILD_SINGLE_SELECTOR(input.dataType(), randomShuffle_, (input, output, rng, isInplace), LIBND4J_TYPES); } - BUILD_SINGLE_TEMPLATE(template void randomShuffle_, (NDArray& input, NDArray& output, nd4j::random::RandomBuffer& rng, const bool isInplace), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void randomShuffle_, (NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace), LIBND4J_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu index 295e15c6d..def7d316f 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu @@ -15,130 +15,209 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include #include +#include -namespace nd4j { -namespace ops { +namespace nd4j { +namespace ops { namespace helpers { - template - static void _CUDA_G adjustHueSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo, void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) { - int numChannels = 3; - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - auto bIn = reinterpret_cast(xBuffer); - auto bOut = reinterpret_cast(zBuffer); - static const int kChannelRange = 6; +/////////////////////////////////////////////////////////////////// +template +static void _CUDA_G adjustHueCuda(const void* vx, const Nd4jLong* xShapeInfo, const Nd4jLong* xTadOffsets, + void* vz, const Nd4jLong *zShapeInfo, const Nd4jLong* zTadOffsets, + const Nd4jLong numOfTads, const T delta, const int dimC) { - for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { - auto i = bIn + e * numChannels; - auto o = bOut + e * numChannels; + const T* x = reinterpret_cast(vx); + T* z = reinterpret_cast(vz); - T h, v_min, v_max; - helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max); + __shared__ int rank; + __shared__ Nd4jLong xDimCstride, zDimCstride; - h += delta * kChannelRange; - while (h < (T) 0.) 
- h += (T) kChannelRange; - - while (h >= (T) kChannelRange) - h -= (T) kChannelRange; - - helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2); - } + if (threadIdx.x == 0) { + rank = shape::rank(xShapeInfo); + xDimCstride = shape::stride(xShapeInfo)[dimC]; + zDimCstride = shape::stride(zShapeInfo)[dimC]; } - template - static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) { - int numChannels = 3; - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - static const int kChannelRange = 6; + __syncthreads(); - auto bufferR = reinterpret_cast(xBuffer) + xOffsets[0]; - auto bufferG = reinterpret_cast(xBuffer) + xOffsets[1]; - auto bufferB = reinterpret_cast(xBuffer) + xOffsets[2]; + const auto tid = blockIdx.x * blockDim.x + threadIdx.x; - auto outputR = reinterpret_cast(zBuffer) + zOffsets[0]; - auto outputG = reinterpret_cast(zBuffer) + zOffsets[1]; - auto outputB = reinterpret_cast(zBuffer) + zOffsets[2]; + for (Nd4jLong i = tid; i < numOfTads; i += gridDim.x * blockDim.x) { + const T* xTad = x + xTadOffsets[i]; + T* zTad = z + zTadOffsets[i]; - for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { - auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; - auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; - auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; + T h, s, v; - auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; - auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; - auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); - T h, v_min, v_max; - helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max); + h += delta * 360; + if(h > 360) + h -= 360; + else if(h < 0) + h += 360; - h += delta * kChannelRange; - while (h < (T) 0) - h += (T) kChannelRange; - - while (h >= (T) kChannelRange) - h -= (T) kChannelRange; - - helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo); - } + hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); } +} - template - static void _adjust_hue_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { - // numChannels is always 3 - auto tuples = array->lengthOf() / 3; - if (isNHWC) { - adjustHueSingleNHWCKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta); - } else { - // TODO: check this one - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2}); +/////////////////////////////////////////////////////////////////// +template +static _CUDA_H void adjustHueCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, + const void* vx, const Nd4jLong* xShapeInfo, const Nd4jLong* xTadOffsets, + void* vz, const Nd4jLong* zShapeInfo, const Nd4jLong* zTadOffsets, + const Nd4jLong numOfTads, const NDArray* deltaScalarArr, const int dimC) { - auto tadLength = shape::length(packX.primaryShapeInfo()); + adjustHueCuda<<>>(vx, xShapeInfo, xTadOffsets, vz, zShapeInfo, zTadOffsets, numOfTads, deltaScalarArr->e(0), dimC); +} 
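
The rewritten kernel above works with hue in degrees and needs only a single wrap after the shift, since |delta| <= 1 corresponds to at most one full turn (the old kernels used a [0, 6) channel range instead). As a reference for the per-pixel transform, here is a self-contained host-side sketch; rgb2hsv/hsv2rgb are minimal stand-ins written for illustration and are not the library's rgbToHsv/hsvToRgb helpers:

    #include <algorithm>
    #include <cmath>

    // Minimal RGB<->HSV pair, h in degrees [0, 360).
    static void rgb2hsv(float r, float g, float b, float& h, float& s, float& v) {
        const float mx = std::max({r, g, b}), mn = std::min({r, g, b}), d = mx - mn;
        v = mx;
        s = mx > 0.f ? d / mx : 0.f;
        if (d == 0.f)     h = 0.f;
        else if (mx == r) h = 60.f * std::fmod((g - b) / d + 6.f, 6.f);
        else if (mx == g) h = 60.f * ((b - r) / d + 2.f);
        else              h = 60.f * ((r - g) / d + 4.f);
    }

    static void hsv2rgb(float h, float s, float v, float& r, float& g, float& b) {
        const float c = v * s;
        const float x = c * (1.f - std::fabs(std::fmod(h / 60.f, 2.f) - 1.f));
        const float m = v - c;
        float rr = 0.f, gg = 0.f, bb = 0.f;
        if      (h <  60.f) { rr = c; gg = x; }
        else if (h < 120.f) { rr = x; gg = c; }
        else if (h < 180.f) { gg = c; bb = x; }
        else if (h < 240.f) { gg = x; bb = c; }
        else if (h < 300.f) { rr = x; bb = c; }
        else                { rr = c; bb = x; }
        r = rr + m; g = gg + m; b = bb + m;
    }

    // Same shift and single wrap as adjustHueCuda, applied to one pixel.
    void adjustHuePixel(float& r, float& g, float& b, float delta /* fraction of a turn */) {
        float h, s, v;
        rgb2hsv(r, g, b, h, s, v);
        h += delta * 360.f;
        if (h > 360.f)      h -= 360.f;
        else if (h < 0.f)   h += 360.f;
        hsv2rgb(h, s, v, r, g, b);
    }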
+BUILD_SINGLE_TEMPLATE(template void adjustHueCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const Nd4jLong* xTadOffsets, void* vz, const Nd4jLong* zShapeInfo, const Nd4jLong* zTadOffsets, const Nd4jLong numOfTads, const NDArray* deltaScalarArr, const int dimC), LIBND4J_TYPES); - adjustHueSingleNCHWKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta); - } +//////////////////////////////////////////////////////////////////////// +void adjustHue(nd4j::LaunchContext* context, const NDArray *input, const NDArray* deltaScalarArr, NDArray *output, const int dimC) { + + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + + const Nd4jLong numOfTads = packX.numberOfTads(); + + const int threadsPerBlock = MAX_NUM_THREADS / 2; + const int blocksPerGrid = (numOfTads + threadsPerBlock - 1) / threadsPerBlock; + + PointersManager manager(context, "adjustHue"); + + NDArray::prepareSpecialUse({output}, {input, deltaScalarArr}); + BUILD_SINGLE_SELECTOR(input->dataType(), adjustHueCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), packX.platformOffsets(), output->specialBuffer(), output->specialShapeInfo(), packZ.platformOffsets(), numOfTads, deltaScalarArr, dimC), LIBND4J_TYPES); + NDArray::registerSpecialUse({output}, {input, deltaScalarArr}); + + manager.synchronize(); +} + + +/* +template +static void _CUDA_G adjustHueSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo, void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) { + int numChannels = 3; + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + + auto bIn = reinterpret_cast(xBuffer); + auto bOut = reinterpret_cast(zBuffer); + static const int kChannelRange = 6; + + for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { + auto i = bIn + e * numChannels; + auto o = bOut + e * numChannels; + + T h, v_min, v_max; + helpers::rgb_to_hv(i[0], i[1], i[2], &h, &v_min, &v_max); + + h += delta * kChannelRange; + while (h < (T) 0.) 
+ h += (T) kChannelRange; + + while (h >= (T) kChannelRange) + h -= (T) kChannelRange; + + helpers::hv_to_rgb(h, v_min, v_max, o, o + 1, o + 2); } +} + +template +static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) { + int numChannels = 3; + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + static const int kChannelRange = 6; + + auto bufferR = reinterpret_cast(xBuffer) + xOffsets[0]; + auto bufferG = reinterpret_cast(xBuffer) + xOffsets[1]; + auto bufferB = reinterpret_cast(xBuffer) + xOffsets[2]; + + auto outputR = reinterpret_cast(zBuffer) + zOffsets[0]; + auto outputG = reinterpret_cast(zBuffer) + zOffsets[1]; + auto outputB = reinterpret_cast(zBuffer) + zOffsets[2]; - template - static void _adjust_hue_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { - auto xType = array->dataType(); + for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { + auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; + auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; + auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; - // numChannels is always 3 - auto tuples = array->lengthOf() / 3; + auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; + auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; + auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);; - if (isNHWC) { - // in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values - BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES); - } else { - // TODO: check this one - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3}); + T h, v_min, v_max; + helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max); - auto tadLength = shape::length(packX.primaryShapeInfo()); + h += delta * kChannelRange; + while (h < (T) 0) + h += (T) kChannelRange; - adjustHueSingleNCHWKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta); - } + while (h >= (T) kChannelRange) + h -= (T) kChannelRange; + + helpers::hv_to_rgb(h, v_min, v_max, _ro, _go, _bo); } +} - void _adjust_hue(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { - auto xType = array->dataType(); +template +static void _adjust_hue_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { + // numChannels is always 3 + auto tuples = array->lengthOf() / 3; + if (isNHWC) { + adjustHueSingleNHWCKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta); + } else { + // TODO: check this one + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2}); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2}); - float d = delta->e(0); - if (array->rankOf() == 4) { - } else { - 
BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, d, isNHWC);, FLOAT_TYPES); - } + auto tadLength = shape::length(packX.primaryShapeInfo()); + + adjustHueSingleNCHWKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta); } +} + +template +static void _adjust_hue_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { + auto xType = array->dataType(); + + // numChannels is always 3 + auto tuples = array->lengthOf() / 3; + + if (isNHWC) { + // in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values + BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES); + } else { + // TODO: check this one + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3}); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3}); + + auto tadLength = shape::length(packX.primaryShapeInfo()); + + adjustHueSingleNCHWKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta); + } +} + +void _adjust_hue(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { + auto xType = array->dataType(); + + float d = delta->e(0); + if (array->rankOf() == 4) { + BUILD_SINGLE_SELECTOR(xType, _adjust_hue_batch, (context, array, output, d, isNHWC);, FLOAT_TYPES); + } else { + BUILD_SINGLE_SELECTOR(xType, _adjust_hue_single, (context, array, output, d, isNHWC);, FLOAT_TYPES); + } +} + +*/ } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu index 474b4de03..ce910a892 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu @@ -15,121 +15,198 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include +#include #include +#include -namespace nd4j { -namespace ops { +namespace nd4j { +namespace ops { namespace helpers { - template - static void _CUDA_G adjustSaturationSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo, void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) { - int numChannels = 3; - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - auto bIn = reinterpret_cast(xBuffer); - auto bOut = reinterpret_cast(zBuffer); - static const int kChannelRange = 6; +/////////////////////////////////////////////////////////////////// +template +static void _CUDA_G adjustSaturationCuda(const void* vx, const Nd4jLong* xShapeInfo, const Nd4jLong* xTadOffsets, + void* vz, const Nd4jLong *zShapeInfo, const Nd4jLong* zTadOffsets, + const Nd4jLong numOfTads, const T factor, const int dimC) { - for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { - auto i = bIn + e * numChannels; - auto o = bOut + e * numChannels; + const T* x = reinterpret_cast(vx); + T* z = reinterpret_cast(vz); - T h, s, v; - // Convert the RGB color to Hue/V-range. 
- helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v); - s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + __shared__ int rank; + __shared__ Nd4jLong xDimCstride, zDimCstride; - // Convert the hue and v-range back into RGB. - helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2); - } + if (threadIdx.x == 0) { + rank = shape::rank(xShapeInfo); + xDimCstride = shape::stride(xShapeInfo)[dimC]; + zDimCstride = shape::stride(zShapeInfo)[dimC]; } - template - static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) { - int numChannels = 3; - auto tid = threadIdx.x + blockIdx.x * blockDim.x; - static const int kChannelRange = 6; + __syncthreads(); - auto bufferR = reinterpret_cast(xBuffer) + xOffsets[0]; - auto bufferG = reinterpret_cast(xBuffer) + xOffsets[1]; - auto bufferB = reinterpret_cast(xBuffer) + xOffsets[2]; + const auto tid = blockIdx.x * blockDim.x + threadIdx.x; - auto outputR = reinterpret_cast(zBuffer) + zOffsets[0]; - auto outputG = reinterpret_cast(zBuffer) + zOffsets[1]; - auto outputB = reinterpret_cast(zBuffer) + zOffsets[2]; + for (Nd4jLong i = tid; i < numOfTads; i += gridDim.x * blockDim.x) { - for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { - auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + const T* xTad = x + xTadOffsets[i]; + T* zTad = z + zTadOffsets[i]; - auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + T h, s, v; - T h, s, v; - // Convert the RGB color to Hue/V-range. - helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v); - s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); - // Convert the hue and v-range back into RGB. 
- helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo); - } + rgbToHsv(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], h, s, v); + + s *= factor; + if(s > 1.f) + s = 1.f; + else if(s < 0.f) + s = 0.f; + + hsvToRgb(h, s, v, zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]); } +} - template - static void _adjust_saturation_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { - // numChannels is always 3 - auto tuples = array->lengthOf() / 3; +/////////////////////////////////////////////////////////////////// +template +static _CUDA_H void adjustSaturationCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, + const void* vx, const Nd4jLong* xShapeInfo, const Nd4jLong* xTadOffsets, + void* vz, const Nd4jLong* zShapeInfo, const Nd4jLong* zTadOffsets, + const Nd4jLong numOfTads, const NDArray* factorScalarArr, const int dimC) { - if (isNHWC) { - adjustSaturationSingleNHWCKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta); - } else { - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2}); + adjustSaturationCuda<<>>(vx, xShapeInfo, xTadOffsets, vz, zShapeInfo, zTadOffsets, numOfTads, factorScalarArr->e(0), dimC); +} +BUILD_SINGLE_TEMPLATE(template void adjustSaturationCudaLauncher, (const int blocksPerGrid, const int threadsPerBlock, const cudaStream_t *stream, const void* vx, const Nd4jLong* xShapeInfo, const Nd4jLong* xTadOffsets, void* vz, const Nd4jLong* zShapeInfo, const Nd4jLong* zTadOffsets, const Nd4jLong numOfTads, const NDArray* factorScalarArr, const int dimC), LIBND4J_TYPES); - auto tadLength = shape::length(packX.primaryShapeInfo()); +//////////////////////////////////////////////////////////////////////// +void adjustSaturation(nd4j::LaunchContext* context, const NDArray *input, const NDArray* factorScalarArr, NDArray *output, const int dimC) { - adjustSaturationSingleNCHWKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta); - } + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), {dimC}); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {dimC}); + + const Nd4jLong numOfTads = packX.numberOfTads(); + + const int threadsPerBlock = MAX_NUM_THREADS / 2; + const int blocksPerGrid = (numOfTads + threadsPerBlock - 1) / threadsPerBlock; + + PointersManager manager(context, "adjustSaturation"); + + NDArray::prepareSpecialUse({output}, {input, factorScalarArr}); + BUILD_SINGLE_SELECTOR(input->dataType(), adjustSaturationCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), packX.platformOffsets(), output->specialBuffer(), output->specialShapeInfo(), packZ.platformOffsets(), numOfTads, factorScalarArr, dimC), LIBND4J_TYPES); + NDArray::registerSpecialUse({output}, {input, factorScalarArr}); + + manager.synchronize(); +} + +/* +template +static void _CUDA_G adjustSaturationSingleNHWCKernel(void *xBuffer, Nd4jLong *xShapeInfo, void *zBuffer, Nd4jLong *zShapeInfo, Nd4jLong tuples, float delta) { + int 
numChannels = 3; + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + + auto bIn = reinterpret_cast(xBuffer); + auto bOut = reinterpret_cast(zBuffer); + static const int kChannelRange = 6; + + for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { + auto i = bIn + e * numChannels; + auto o = bOut + e * numChannels; + + T h, s, v; + // Convert the RGB color to Hue/V-range. + helpers::rgb_to_hsv(i[0], i[1], i[2], &h, &s, &v); + s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + + // Convert the hue and v-range back into RGB. + helpers::hsv_to_rgb(h, s, v, o, o + 1, o + 2); } +} - template - static void _adjust_saturation_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { - auto xType = array->dataType(); +template +static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShapeInfo, Nd4jLong *xOffsets, void *zBuffer, Nd4jLong *zTadShapeInfo, Nd4jLong *zOffsets, Nd4jLong tadLength, Nd4jLong tuples, float delta) { + int numChannels = 3; + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + static const int kChannelRange = 6; - // numChannels is always 3 - auto tuples = array->lengthOf() / 3; + auto bufferR = reinterpret_cast(xBuffer) + xOffsets[0]; + auto bufferG = reinterpret_cast(xBuffer) + xOffsets[1]; + auto bufferB = reinterpret_cast(xBuffer) + xOffsets[2]; - if (isNHWC) { - // in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values - BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES); - } else { - // TODO: check this one - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3}); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3}); + auto outputR = reinterpret_cast(zBuffer) + zOffsets[0]; + auto outputG = reinterpret_cast(zBuffer) + zOffsets[1]; + auto outputB = reinterpret_cast(zBuffer) + zOffsets[2]; - auto tadLength = shape::length(packX.primaryShapeInfo()); + for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { + auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - adjustSaturationSingleNCHWKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta); - } + auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + + T h, s, v; + // Convert the RGB color to Hue/V-range. + helpers::rgb_to_hsv(_ri[0], _gi[0], _bi[0], &h, &s, &v); + s = nd4j::math::nd4j_min((T) 1.0f, nd4j::math::nd4j_max((T) 0.0f, s * delta)); + // Convert the hue and v-range back into RGB. 
+ helpers::hsv_to_rgb(h, s, v, _ro, _go, _bo); } +} - void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { - auto xType = array->dataType(); +template +static void _adjust_saturation_single(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { + // numChannels is always 3 + auto tuples = array->lengthOf() / 3; - float d = delta->e(0); - if (array->rankOf() == 4) { - BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_batch, (context, array, output, d, isNHWC);, FLOAT_TYPES); - } else { - BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, d, isNHWC);, FLOAT_TYPES); - } + if (isNHWC) { + adjustSaturationSingleNHWCKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), array->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), tuples, delta); + } else { + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {1, 2}); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {1, 2}); + + auto tadLength = shape::length(packX.primaryShapeInfo()); + + adjustSaturationSingleNCHWKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta); } +} +template +static void _adjust_saturation_batch(nd4j::LaunchContext * context, NDArray *array, NDArray *output, float delta, bool isNHWC) { + auto xType = array->dataType(); + + // numChannels is always 3 + auto tuples = array->lengthOf() / 3; + + if (isNHWC) { + // in case of nhwc batch, we don't really care about examples: it's still bunch of RGB values + BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, delta, isNHWC);, FLOAT_TYPES); + } else { + // TODO: check this one + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(array->getShapeInfo(), {0, 2, 3}); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {0, 2, 3}); + + auto tadLength = shape::length(packX.primaryShapeInfo()); + + adjustSaturationSingleNCHWKernel<<<256, 256, 1024, *context->getCudaStream()>>>(array->specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), output->specialBuffer(), packZ.platformShapeInfo(), packZ.platformOffsets(), tadLength, tuples, delta); + } +} + +void adjust_saturation(nd4j::LaunchContext * context, NDArray *array, NDArray *output, NDArray* delta, bool isNHWC) { + auto xType = array->dataType(); + + float d = delta->e(0); + if (array->rankOf() == 4) { + BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_batch, (context, array, output, d, isNHWC);, FLOAT_TYPES); + } else { + BUILD_SINGLE_SELECTOR(xType, _adjust_saturation_single, (context, array, output, d, isNHWC);, FLOAT_TYPES); + } +} +*/ } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu index f261e67e0..952bf47c7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu @@ -22,20 +22,99 @@ #include #include #include +#include namespace nd4j { namespace ops { namespace helpers { template - static void dropoutSimple(NDArray const* input, NDArray* output, double probValue, int seed) { + static __global__ void dropoutSimpleKernel(void const* inputBuf, 
Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probVal, int inLen, nd4j::graph::RandomGenerator* nodeRng) {
+       auto tid = blockIdx.x * blockDim.x + threadIdx.x;
+       auto step = blockDim.x * gridDim.x;
+       __shared__ T const* input;
+       __shared__ T* output;
+       if (threadIdx.x == 0) {
+           input = reinterpret_cast(inputBuf);
+           output = reinterpret_cast(outputBuf);
+       }
+       __syncthreads();
+
+       // grid-stride loop: each thread handles its own slice of the elements
+       for (Nd4jLong e = tid; e < inLen; e += step) {
+           T val = nodeRng->relativeT(e, T(0.f), T(1.f));
+
+           if (double(val) < probVal)
+               output[shape::getIndexOffset(e, outputShape, inLen)] = T(input[shape::getIndexOffset(e, inputShape, inLen)] / probVal);
+       }
    }
-   BUILD_SINGLE_TEMPLATE(template void dropoutSimple, (NDArray const* input, NDArray* output, double probValue, int seed), FLOAT_TYPES);
+
+   template
+   static void dropoutSimple(nd4j::LaunchContext* context, NDArray const* input, NDArray* output, double probValue, int seed) {
+       nd4j::graph::RandomGenerator nodeRng(3019L, seed);
+       int inLen = input->lengthOf();
+       nd4j::graph::RandomGenerator* dRandom;
+       auto stream = context->getCudaStream();
+       NDArray::prepareSpecialUse({output}, {input});
+
+       auto err = cudaMalloc(&dRandom, sizeof(nd4j::graph::RandomGenerator));
+       if (err) {
+           throw cuda_exception::build("helpers::dropoutSimple: Cannot allocate device memory for random generator.", err);
+       }
+       err = cudaMemcpy(dRandom, &nodeRng, sizeof(nd4j::graph::RandomGenerator), cudaMemcpyHostToDevice);
+       if (err) {
+           throw cuda_exception::build("helpers::dropoutSimple: Cannot set up device memory for random generator.", err);
+       }
+
+       dropoutSimpleKernel<<<128, 256, 1024, *stream>>>(input->getSpecialBuffer(), input->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), probValue, inLen, dRandom);
+       err = cudaFree(dRandom);
+       if (err) {
+           throw cuda_exception::build("helpers::dropoutSimple: Cannot deallocate device memory for random generator.", err);
+       }
+       NDArray::registerSpecialUse({output}, {input});
+   }
+
+   BUILD_SINGLE_TEMPLATE(template void dropoutSimple, (nd4j::LaunchContext* context, NDArray const* input, NDArray* output, double probValue, int seed), FLOAT_TYPES);

    template
    int _dropOutFunctor(graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue) {
+       if (reduceShape == nullptr){
+           dropoutSimple(context.launchContext(), input, output, probValue, seed);
+       }
+       else {
+           REQUIRE_TRUE(reduceShape->lengthOf() <= input->rankOf(), 0, "dropout: Noise shape should fit the input");
+
+           std::vector dims(reduceShape->lengthOf());
+           reduceShape->syncToHost(); // to ensure that the following reads see the actual values
+           bool fit = true;
+// PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit))
+           for( int i = 0; i < dims.size(); i++ ) {
+               if (fit) {
+                   dims[i] = reduceShape->e(i);
+                   for (int e = 0; e < input->rankOf(); ++e)
+                       if (fit)
+                           if (input->sizeAt(e) % dims[i]) {
+                               fit = false;
+                           }
+               }
+           }
+
+           // check dims to fit input
+           REQUIRE_TRUE(fit, 0, "dropout: Noise shape should fit to input rank.");
+           std::unique_ptr chunk(new NDArray('c', dims, output->dataType(), context.launchContext()));
+           chunk->assign(1.f);
+           //chunk->applyRandom>(rng, nullptr, chunk.get(), &probValue);
+           //NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob);
+           dropoutSimple(context.launchContext(), chunk.get(), chunk.get(), probValue, seed);
+           // broadcast chunk to full matrix
+           std::unique_ptr dropOutMultiplier(new NDArray(*input));
+           dropOutMultiplier->assign(1.f);
+
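           // `chunk` holds the dropout mask in the reduced noise shape; the broadcasting
           // `+=` below expands it across the full shape of `input`, so the result can be
           // applied as a per-element multiplier.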
*dropOutMultiplier += *chunk; + + output->assign(*input * *dropOutMultiplier); //input->applyPairwiseTransform(pairwise::Multiply, dropOutMultiplier.get(), output, nullptr); + } + return Status::OK(); } @@ -48,14 +127,121 @@ namespace helpers { BUILD_SINGLE_TEMPLATE(template int _dropOutFunctor, (graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue);, FLOAT_TYPES); /////////////////////////////////// backrpopagations /////////////////////////////////////////////// + template + static __global__ void dropoutBPKernel(void* outputBuf, Nd4jLong* outputShape, void* gradOutBuf, Nd4jLong* gradOutShape, double probValue) { + __shared__ T* output; + __shared__ T* input; + __shared__ int len; + + if (threadIdx.x == 0) { + len = shape::length(outputShape); + output = reinterpret_cast(outputBuf); + input = reinterpret_cast(gradOutBuf); + } + + auto tid = blockIdx.x * blockDim.x + threadIdx.x; + auto step = blockDim.x * gridDim.x; + + for (int e = tid; e < len; e += step) { + if (output[shape::getIndexOffset(e, outputShape, len)] != T(0.)) + output[shape::getIndexOffset(e, outputShape, len)] = T(input[shape::getIndexOffset(e, gradOutShape, len)] / probValue); + + } + } template static int dropOutFunctorBP_(graph::Context& context, NDArray* input, NDArray* gradOut, NDArray* output, NDArray* reduceShape, int seed, double probValue) { - return Status::OK(); + int res = dropOutFunctor(context, input, output, reduceShape, seed, probValue); + auto stream = context.launchContext()->getCudaStream(); + + if (ND4J_STATUS_OK == res) + dropoutBPKernel<<<128, 256, 1024, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), probValue); + + return res; + } + + template + static __global__ void alphaDropoutSimpleKernel(void const* inputBuf, Nd4jLong const* inputShape, void* outputBuf, Nd4jLong* outputShape, double probValue, double alpha, double alpha1, double beta, int inLen, nd4j::graph::RandomGenerator* nodeRng) { + auto tid = blockIdx.x * blockDim.x + threadIdx.x; + auto step = blockDim.x * gridDim.x; + __shared__ T const* input; + __shared__ T* output; + + if (threadIdx.x == 0) { + input = reinterpret_cast(inputBuf); + output = reinterpret_cast(outputBuf); + } + + for (auto e = tid; e < inLen; e += step) { + T val = nodeRng->relativeT(e, T(0.f), T(1.f)); + T xVal = input[shape::getIndexOffset(e, inputShape, inLen)]; + output[shape::getIndexOffset(e, outputShape, inLen)] = (val >= T(probValue) ? 
T(alpha * beta + alpha1) : T(alpha * (double)xVal + alpha1)); + } + } + template + static void alphaDropoutSimple(nd4j::LaunchContext* context, NDArray const* input, NDArray* output, int seed, double probValue, double alpha, double alpha1, double beta) { + nd4j::graph::RandomGenerator nodeRng(3019L, seed), *dRandom; + auto stream = context->getCudaStream(); + auto err = cudaMalloc(&dRandom, sizeof(nd4j::graph::RandomGenerator)); + NDArray::prepareSpecialUse({output}, {input}); + if (err) { + throw cuda_exception::build("helpers::alphaDropoutSimple: Cannot allocate device memory for random generator.", err); + } + err = cudaMemcpy(dRandom, &nodeRng, sizeof(nd4j::graph::RandomGenerator), cudaMemcpyHostToDevice); + if (err) { + throw cuda_exception::build("helpers::alphaDropoutSimple: Cannot set up device memory for random generator.", err); + } + + alphaDropoutSimpleKernel<<<128, 256, 1024, *stream>>>(input->getSpecialBuffer(), input->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), probValue, alpha, alpha1, beta, output->lengthOf(), dRandom); + + err = cudaFree(dRandom); + if (err) { + throw cuda_exception::build("helpers::alphaDropoutSimple: Cannot deallocate device memory for random generator.", err); + } + NDArray::registerSpecialUse({output}, {input}); } template static int alphaDropOutFunctor_(graph::Context& context, NDArray* input, NDArray* output, NDArray* reduceShape, int seed, double probValue, double alpha, double alpha1, double beta) { + + if (reduceShape == nullptr){ + alphaDropoutSimple(context.launchContext(), input, output, seed, probValue, alpha, alpha1, beta); + } + else { + REQUIRE_TRUE(reduceShape->lengthOf() <= input->rankOf(), 0, "dropout: Noise shape should be fittable to input"); + + std::vector dims(reduceShape->lengthOf()); + reduceShape->syncToHost(); // to ensure that follows are actual + bool fit = true; +// PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(fit)) + for( int i = 0; i < dims.size(); i++ ) { + if (fit) { + dims[i] = reduceShape->e(i); + for (int e = 0; e < input->rankOf(); ++e) + if (fit) + if (input->sizeAt(e) % dims[i]) { + fit = false; + } + } + } + + // check dims to fit input + REQUIRE_TRUE(fit, 0, "alpha_dropout: Noise shape should fit to input rank."); + std::unique_ptr chunk(new NDArray('c', dims, output->dataType(), context.launchContext())); + chunk->assign(1.f); + //chunk->applyRandom>(rng, nullptr, chunk.get(), &probValue); + //NativeOpExecutioner::execRandom(random::DropOutInverted, rng, chunk->buffer(), chunk->shapeInfo(), chunk->buffer(), chunk->shapeInfo(), &prob); + alphaDropoutSimple(context.launchContext(), chunk.get(), chunk.get(), seed, probValue, alpha, alpha1, beta); + // broadcast chunk to full matrix + std::unique_ptr dropOutMultiplier(new NDArray(*input)); + dropOutMultiplier->assign(1.f); + + *dropOutMultiplier += *chunk; + + output->assign(*input * *dropOutMultiplier); //input->applyPairwiseTransform(pairwise::Multiply, dropOutMultiplier.get(), output, nullptr); + } + + return Status::OK(); } @@ -63,7 +249,12 @@ namespace helpers { int alphaDropOutFunctorBP_(graph::Context& context, NDArray* input, NDArray* gradOut, NDArray* output, NDArray* reduceShape, int seed, double probValue, double alpha, double alpha1, double beta) { - return Status::OK(); + int res = alphaDropOutFunctor(context, input, output, reduceShape, seed, probValue, alpha, alpha1, beta); + if (res == ND4J_STATUS_OK) { + (*output) *= alpha; + (*output) *= (*gradOut); //->applyPairwiseTransform(gradOut, output, nullptr); + } + return res; 
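        // For reference, the per-element decision made by alphaDropoutSimpleKernel above,
        // reduced to a scalar sketch (illustrative only; the name alphaDropoutUnit is hypothetical):
        /*
            float alphaDropoutUnit(float x, float uniform01,
                                   double probValue, double alpha, double alpha1, double beta) {
                // draws failing the keep test collapse to the constant alpha * beta + alpha1;
                // kept values are rescaled affinely, so that mean and variance can be
                // preserved overall (as in self-normalizing networks)
                return uniform01 >= probValue ? float(alpha * beta + alpha1)
                                              : float(alpha * x + alpha1);
            }
        */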
int dropOutFunctorBP(graph::Context& context, NDArray* input, NDArray* gradOut, NDArray* output, NDArray* reduceShape, int seed, double probValue) { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gru.cu b/libnd4j/include/ops/declarable/helpers/cuda/gru.cu index 5d1d3ca2f..8e7b62a91 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gru.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gru.cu @@ -35,58 +35,88 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// -void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wru, const NDArray* Wc, - const NDArray* bru, const NDArray* bc, +void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, + const NDArray* b, const NDArray* bc, NDArray* r, NDArray* u, NDArray* c, NDArray* h) { //Inputs: - // x input [bS x inSize] - // hLast previous cell output [bS x numUnits], that is at previous time step t-1 - // Wru RU weights - [bS, 2*numUnits] - reset and update gates - // Wc C weights - [bS, numUnits] - cell gate - // bru r and u biases, [2*numUnits] - reset and update gates - // bc c biases, [numUnits] - cell gate + // x input [bS, iS], iS - input size + // hLast previous cell output [bS, nU], that is at previous time step t-1, nU - number of units + // W RU weights - [iS+nU, 2*nU] - reset and update gates + // Wc C weights - [iS+nU, nU] - cell gate + // b r and u biases, [2*nU] - reset and update gates + // bc c biases, [nU] - cell gate //Outputs: - // r Reset gate output [bS, numUnits] - // u Update gate output [bS, numUnits] - // c Cell gate output [bS, numUnits] - // h current cell output [bS, numUnits] + // r Reset gate output [bS, nU] + // u Update gate output [bS, nU] + // c Cell gate output [bS, nU] + // h current cell output [bS, nU] - const int nIn = x->sizeAt(1); - const int nU = hLast->sizeAt(1); // number of units + /***************************************************************************************/ + /************************ THIS IS NOT OPTIMIZED CODE ***********************************/ + /** however it is more math-friendly and convenient for deriving the backprop formulas **/ - //Concat inputs: [x, yt-1]: concat([bs,nIn],[bs,nOut]) -> [bs, (nIn+nOut)] - nd4j::ops::concat concatOp; - std::vector inputs; - std::vector targs; - std::vector iargs({1}); //Axis = 1 - std::vector bargs; - inputs.emplace_back(const_cast(x)); - inputs.emplace_back(const_cast(hLast)); + const int bS = x->sizeAt(0); + const int iS = x->sizeAt(1); + const int nU = hLast->sizeAt(1); - auto result = concatOp.execute(inputs, targs, iargs, bargs); - auto concatOut = result->at(0); + NDArray Wrx = (*W)({0,iS, 0,nU}); // [iS, nU] + NDArray Wux = (*W)({0,iS, nU,2*nU}); // [iS, nU] + NDArray Wrh = (*W)({iS,iS+nU, 0,nU}); // [nU, nU] + NDArray Wuh = (*W)({iS,iS+nU, nU,2*nU}); // [nU, nU] - //mmul/z for reset and update gates: (x * weight_ux + hLast * weight_xr + b_u) - auto m = mmul(*concatOut, *Wru); //mmul: [bs, (nIn+numUnits)]* [(inSize+numUnits), 2*numUnits] = [bs, 4*numUnits] - m += (*bru); + NDArray Wcx = (*Wc)({0,iS, 0,0}); // cell gate input weights [iS, nU] + NDArray Wch = (*Wc)({iS,iS+nU, 0,0}); // cell gate hidden weights [nU, nU] - sigmoidInplace(m); //sigmoid(rz) and sigmoid(uz) - auto mr = m({0,0, 0, nU}); - auto mu = m({0,0, nU, 2*nU}); + NDArray br = (*b)({0, nU}); // [nU] + NDArray bu = (*b)({nU, 2*nU}); // [nU] - r->assign(&mr); - u->assign(&mu); + // × means matrix multiplication + // * means element-wise product, or the so-called Hadamard product
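+ + // Editorial summary of the cell computed below: + // r = sigmoid(x × Wrx + hLast × Wrh + br) + // u = sigmoid(x × Wux + hLast × Wuh + bu) + // c = tanh(x × Wcx + (r * hLast) × Wch + bc) + // h = u * hLast + (1 - u) * c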
- //Concatenated inputs: [x, yt-1 .* r] - auto yr = (*concatOut)({0,0, nIn, nIn+nU}); - yr *= (*r); + // reset gate + r->assign(mmul(*x, Wrx) + mmul(*hLast, Wrh) + br); // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + r->applyTransform(transform::Sigmoid); - //c = tanh(x * weight_cx + (hLast .* r) * weight_cr + b_c) - MmulHelper::mmul(concatOut, const_cast(Wc), c, 1.0, 0.0); //c = 1.0 * concatOut * Wc + 0.0 * c + // update gate + u->assign(mmul(*x, Wux) + mmul(*hLast, Wuh) + bu); // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + u->applyTransform(transform::Sigmoid); + + // cell gate c = activation(x × Wcx + (r * hLast) × Wch + bc) + c->assign(mmul(*x, Wcx) + mmul(*r * *hLast, Wch) + *bc); // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + c->applyTransform(transform::Tanh); + + NDArray temp = 1.f - *c * *c; + + // cell output + h->assign(*u * *hLast + (1.f - *u) * *c); + + + /***************************************************************************************/ + /*************** THIS IS MORE OPTIMIZED CODE (should think about concat) ***************/ + /***************************************************************************************/ +/* + //Concat inputs: x + hLast : [bs, iS + nU] + NDArray xhConcat(x->ordering(), {bS, iS + nU}, x->dataType(), context); // concat([bs, iS], [bs, nU]) -> [bs, iS + nU] + helpers::concat(context, {const_cast(x), const_cast(hLast)}, xhConcat, {1}); + + //mmul for reset and update gates: (x × weight_ux + hLast × weight_xr + b_u) + auto m = mmul(xhConcat, *W) + *b; // [bs, iS+nU] * [iS+nU, 2*nU] = [bs, 2*nU] + // m += *bru; + + m.applyTransform(transform::Sigmoid); //sigmoid(rz) and sigmoid(uz) + + r->assign(m({0,0, 0, nU})); + u->assign(m({0,0, nU, 2*nU})); + + // hLast = hLast * r + xhConcat({0,0, iS, iS+nU}) *= *r; + + //c = tanh(x × weight_cx + (hLast * r) × weight_cr + b_c) + MmulHelper::mmul(&xhConcat, Wc, c, 1.0, 0.0); //c = 1.0 * xhConcat * Wc + 0.0 * c *c += *bc; - tanhInplace(*c); + c->applyTransform(transform::Tanh); //Output: h = (1-u).*c + u .* hPrev //auto hResult = (*u) * (*hLast) + (1.0f - *u) * (*c); const_cast(h)->assign(&hResult); @@ -94,115 +124,238 @@ void gruCell(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLa auto temp = (1.0f - *u); temp *= (*c); (*h) += temp; - - delete result; +*/ } ////////////////////////////////////////////////////////////////////////// -void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h) { +void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* hLast, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h) { + // x input [time, bS, iS] + // hLast initial cell output (at time step = 0) [bS, nU] + // Wx input-to-hidden weights, [iS, 3*nU] + // Wh hidden-to-hidden weights, [nU, 3*nU] + // b biases, [3*nU] + + // h is cell outputs at each time step [time, bS, nU] + + const int time = x->sizeAt(0); + + NDArray ht_1(*hLast); + + // loop through time steps + for (int t = 0; t < time; ++t) { + + auto xt = (*x)({t,t+1, 0,0, 0,0}); + auto ht = (*h)({t,t+1, 0,0, 0,0}); + + // helpers::gruCell(&xt, &ht_1, Wx, Wh, b, &ht); + // ht_1.assign(ht); + } }
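+ + // Editorial sketch (assumption, not in this patch): the W/Wc/b/bc weights expected by gruCell still have to be reconciled with the Wx/Wh/b arguments taken here; once they are, each iteration above is expected to run roughly + // NDArray rt('c', {bS, nU}, x->dataType(), context), ut(rt), ct(rt); // hypothetical gate temporaries + // gruCell(context, &xt, &ht_1, W, Wc, b, bc, &rt, &ut, &ct, &ht); + // ht_1.assign(ht); + // which is presumably why the loop body is left commented out for now.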
////////////////////////////////////////////////////////////////////////// -void gruCellBP(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* dLdh, const NDArray* dLdWx0, - const NDArray* dLdWh0, const NDArray* dLdb0, NDArray* dLdx, NDArray* dLdh0, NDArray* dLdWx, NDArray* dLdWh, NDArray* dLdb) { +void gruCellBP(nd4j::LaunchContext* context, + const NDArray* x, const NDArray* hLast, + const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc, + const NDArray* dLdr, const NDArray* dLdu, const NDArray* dLdc, const NDArray* dLdh, + NDArray* dLdx, NDArray* dLdhLast, + NDArray* dLdW, NDArray* dLdWc, + NDArray* dLdb, NDArray* dLdbc) { - // x input [bS, iS] - // h0 previous cell output [bS, nU], that is at previous time step t-1 - // Wx input-to-hidden weights, [iS, 3*nU] - // Wh hidden-to-hidden weights, [nU, 3*nU] - // b biases, [3*nU] - // dLdh gradient wrt output, [bS,nU], that is epsilon_next - // dLdWx0 gradient wrt Wx at previous time step, [iS, 3*nU] - // dLdWh0 gradient wrt Wh at previous time step, [nU, 3*nU] - // dLdb0 gradient wrt b at previous time step, [3*nU] + //Inputs: + // x input [bS, iS] + // hLast previous cell output [bS, nU], that is at previous time step t-1 + // W weights - [iS+nU, 2*nU] - reset and update gates + // Wc C weights - [iS+nU, nU] - cell gate + // b r and u biases, [2*nU] - reset and update gates + // bc c biases, [nU] - cell gate + // dLdr gradient wrt reset gate, [bS, nU] + // dLdu gradient wrt update gate, [bS, nU] + // dLdc gradient wrt cell state, [bS, nU] + // dLdh gradient wrt current cell output, [bS, nU] - // dLdx gradient wrt x, [bS, iS], that is epsilon - // dLdh0 gradient wrt h0, [bS, nU] - // dLdWx gradient wrt Wx, [iS, 3*nU] - // dLdWh gradient wrt Wh, [nU, 3*nU] - // dLdb gradient wrt b at previous time step, [3*nU] + //Outputs: + // dLdx gradient wrt x, [bS, iS] + // dLdhLast gradient wrt hLast, [bS, nU] + // dLdW gradient wrt W, [iS+nU, 2*nU] + // dLdWc gradient wrt Wc, [iS+nU, nU] + // dLdb gradient wrt b, [2*nU] + // dLdbc gradient wrt bc, [nU] - // h is current cell output [bS, nU], that is at current time step t + // * means element-wise product, or the so-called Hadamard product + // × means matrix multiplication + + /************************************************************************************************/ + /******************************* THIS IS NOT OPTIMIZED CODE *************************************/ + /*** the aim is math-readable code that keeps the backprop formula derivation traceable ***/ + + const int bS = x->sizeAt(0); + const int iS = x->sizeAt(1); + const int nU = hLast->sizeAt(1); + + NDArray xT = x->transpose(); // [iS, bS] + NDArray hLastT = hLast->transpose(); // [nU, bS] + + NDArray Wrx = (*W)({0,iS, 0,nU}); // [iS, nU] + NDArray Wux = (*W)({0,iS, nU,2*nU}); // [iS, nU] + NDArray Wrh = (*W)({iS,iS+nU, 0,nU}); // [nU, nU] + NDArray Wuh = (*W)({iS,iS+nU, nU,2*nU}); // [nU, nU] + + NDArray Wcx = (*Wc)({0,iS, 0,0}); // cell gate input weights [iS, nU] + NDArray Wch = (*Wc)({iS,iS+nU, 0,0}); // cell gate hidden weights [nU, nU] + + NDArray br = (*b)({0, nU}); // [nU] + NDArray bu = (*b)({nU, 2*nU}); // [nU] + + NDArray WrxT = Wrx.transpose(); // [nU, iS] + NDArray WuxT = Wux.transpose(); // [nU, iS] + NDArray WrhT = Wrh.transpose(); // [nU, nU] + NDArray WuhT = Wuh.transpose(); // [nU, nU] + + NDArray WcxT = Wcx.transpose(); // [nU, iS] + NDArray WchT = Wch.transpose(); // [nU, nU] + + NDArray dLdWrx = (*dLdW)({0,iS, 0,nU}); // [iS, nU] + NDArray dLdWux = (*dLdW)({0,iS, nU,2*nU}); // [iS, nU] + NDArray dLdWrh = (*dLdW)({iS,iS+nU, 0,nU}); // [nU, nU] + NDArray dLdWuh = (*dLdW)({iS,iS+nU, nU,2*nU}); // [nU, nU]
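+ + // Editorial note on the layout behind the views above and in gruCell: + // W = [ Wrx | Wux ] [iS+nU, 2*nU] Wc = [ Wcx ] [iS+nU, nU] + // [ Wrh | Wuh ] [ Wch ] + // x enters through the top iS rows and hLast through the bottom nU rows; dLdW and dLdWc are sliced identically.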
+ + NDArray dLdWcx = (*dLdWc)({0,iS, 0,0}); // [iS, nU] + NDArray dLdWch = (*dLdWc)({iS,iS+nU, 0,0}); // [nU, nU] + + NDArray dLdbr = (*dLdb)({0, nU}); // [nU] + NDArray dLdbu = (*dLdb)({nU, 2*nU}); // [nU] - const int nU = h0->sizeAt(1); // ***** feed forward step ***** // - // gates = sigmoid(x*Wx + h0*Wh + b) - auto gates = sigmoid(mmul(*x, (*Wx)({0,0, 0,2*nU})) + mmul(*h0, (*Wh)({0,0, 0,2*nU})) + (*b)({0,2*nU})); // [bS, 2*nU] + [bS, 2*nU] + [1, 2*nU] = [bS, 2*nU] + // reset gate - auto r = gates({0,0, 0, nU}); // [bS, nU] + NDArray r = mmul(*x, Wrx) + mmul(*hLast, Wrh) + br; // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + r.applyTransform(transform::Sigmoid); + // update gate - auto u = gates({0,0, nU, 2*nU}); // [bS, nU] - // ◦ means element-wise product or so called Hadamard product - // n = tanh(x*Wx + (r◦h0)*Wh + b) - auto n = tanh(mmul(*x, (*Wx)({0,0, 2*nU,3*nU})) + mmul((*h0)*r, (*Wh)({0,0, 2*nU,3*nU})) + (*b)({2*nU,3*nU})); // [bS, nU] + NDArray u = mmul(*x, Wux) + mmul(*hLast, Wuh) + bu; // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + u.applyTransform(transform::Sigmoid); + + // cell gate c = activation(x × Wcx + (r * hLast) × Wch + bc) + NDArray c = mmul(*x, Wcx) + mmul(r * *hLast, Wch) + *bc; // [bS, iS] × [iS, nU] + [bS, nU] × [nU, nU] + [nU] = [bS, nU] + c.applyTransform(transform::Tanh); + + // h = (1 - u) * c + u * hLast + + // ***** back prop step ***** // - auto Wxr = (*Wx)({0,0, 0, nU}); - auto Wxu = (*Wx)({0,0, nU, 2*nU}); - auto Wxn = (*Wx)({0,0, 2*nU,3*nU}); - auto Whr = (*Wh)({0,0, 0, nU}); - auto Whu = (*Wh)({0,0, nU, 2*nU}); - auto Whn = (*Wh)({0,0, 2*nU,3*nU}); - auto WxrT = Wxr.transpose(); - auto WxuT = Wxu.transpose(); - auto WxnT = Wxn.transpose(); - auto WhrT = Whr.transpose(); - auto WhuT = Whu.transpose(); - auto WhnT = Whn.transpose(); - auto xT = x->transpose(); - auto h0T = h0->transpose(); - auto dLdWxr = (*dLdWx)({0,0, 0, nU}); - auto dLdWxu = (*dLdWx)({0,0, nU, 2*nU}); - auto dLdWxn = (*dLdWx)({0,0, 2*nU,3*nU}); + // notations: + // Zr = x × Wrx + hLast × Wrh + br + // Zu = x × Wux + hLast × Wuh + bu + // Sr = sigmoid(Zr) + // Su = sigmoid(Zu) + // Zc = x × Wcx + (r * hLast) × Wch + bc - auto dLdWhr = (*dLdWh)({0,0, 0, nU}); - auto dLdWhu = (*dLdWh)({0,0, nU, 2*nU}); - auto dLdWhn = (*dLdWh)({0,0, 2*nU,3*nU}); - auto dLdbr = (*dLdb)({0, nU}); - auto dLdbu = (*dLdb)({nU, 2*nU}); - auto dLdbn = (*dLdb)({2*nU,3*nU}); + // dLdx = dLdh * dhdx = dLdh * (dhdu * dudx + dhdc * dcdx) = (dLdh * dhdu) * dudx + (dLdh * dhdc) * dcdx = dLdu * dudx + dLdc * dcdx + // = dLdx_u + dLdx_c + // dLdx_u = dLdu * dudx = dLdu * dudZu * dZudx = |dZudx = ... × WuxT| = (dLdu * dudZu) × WuxT + // dLdx_c = dLdc * dcdx = dLdc * dcdZc * (dZcdx + dZcdr * drdx) = dLdc * dcdZc * dZcdx + dLdc * dcdZc * dZcdr * drdx = dLdx_c0 + dLdx_c1 + // dLdx_c0 = dLdc * dcdZc * dZcdx = |dZcdx = ... × WcxT| = (dLdc * dcdZc) × WcxT + // dZcdr = (... * hLast) × WchT + // dLdr = dLdc * dcdZc * dZcdr = (dLdc * dcdZc * hLast) × WchT + // drdx = drdZr * dZrdx + // dZrdx = ... × WrxT
+ // dLdx_c1 = dLdc * dcdZc * dZcdr * drdx = dLdr * drdx = (dLdr * drdZr) × WrxT + // finally dLdx = dLdx_u + dLdx_c0 + dLdx_c1 = (dLdu * dudZu) × WuxT + (dLdc * dcdZc) × WcxT + (dLdr * drdZr) × WrxT - auto dhdu = *h0 - n; // [bS, nU] - auto dhdn = 1.f - u; // [bS, nU] - auto dSigdu = u * (1.f - u); // [bS, nU] - auto dSigdr = r * (1.f - r); // [bS, nU] - auto dActdn = 1.f - n * n; // [bS, nU] - auto dndr = mmul(dActdn * (*h0), WhnT); - auto drdh0 = mmul(dSigdr, WhrT); - auto dLdn = (*dLdh) * dhdn; - auto dLdu = (*dLdh) * dhdu; - auto dLdr = dLdn * dndr; + // dLdhLast = dLdh * (dhdhLast + dhdu * dudhLast + dhdc * dcdhLast) = dLdh * dhdhLast + dLdu * dudhLast + dLdc * dcdhLast + // = dLdhLast_h + dLdhLast_u + dLdhLast_c + // dLdhLast_h = dLdh * dhdhLast = dLdh * u + // dLdhLast_u = dLdu * dudhLast = |dudhLast = dudZu * dZudhLast, dZudhLast = ... × WuhT| = (dLdu * dudZu) × WuhT + // dLdhLast_c = dLdc * dcdhLast = dLdc * (dcdZc * dZcdhLast + dcdZc * dZcdr * drdhLast) = + // = dLdc * dcdZc * dZcdhLast + dLdc * dcdZc * dZcdr * drdhLast = + // = dLdc * dcdZc * dZcdhLast + dLdr * drdhLast = dLdhLast_c0 + dLdhLast_c1 + // dLdhLast_c0 = dLdc * dcdZc * dZcdhLast = |dZcdhLast = (... * r) × WchT| = (dLdc * dcdZc * r) × WchT + // dLdhLast_c1 = dLdr * drdhLast = |drdhLast = drdZr * dZrdhLast, dZrdhLast = ... × WrhT| = (dLdr * drdZr) × WrhT + // finally dLdhLast = dLdhLast_h + dLdhLast_u + dLdhLast_c0 + dLdhLast_c1 = + // = dLdh * u + (dLdu * dudZu) × WuhT + (dLdc * dcdZc * r) × WchT + (dLdr * drdZr) × WrhT - dLdx->assign( mmul(dLdu * dSigdu, WxuT) + mmul(dLdr * dSigdr, WxrT) + mmul(dLdn * dActdn, WxnT) ); // [bS,iS] - dLdh0->assign( mmul(dLdu * dSigdu, WhuT) + mmul(dLdn * dActdn * (r + drdh0), WhnT) + (*dLdh)*u ); // [bS,nU] - dLdWxr.assign( mmul(xT, dSigdr * dLdr) ); // [iS,nU] - dLdWhr.assign( mmul(h0T, dSigdr * dLdr) ); // [nU,nU] + // dLdWrx = dLdh * dhdWrx = (dLdh * dhdc) * dcdWrx = dLdc * dcdZc * dZcdWrx = dLdc * dcdZc * dZcdr * drdWrx = + // = dLdc * dcdZc * dZcdr * drdZr * dZrdWrx = dLdr * drdZr * dZrdWrx + // dZrdWrx = xT × ... + // finally dLdWrx = xT × (dLdr * drdZr) - dLdWxu.assign( mmul(xT, dSigdu * dLdu) ); // [iS,nU] - dLdWhu.assign( mmul(h0T, dSigdu * dLdu) ); // [nU,nU] - dLdWxn.assign( mmul(xT, dActdn * dLdn) ); // [iS,nU] - dLdWhn.assign( mmul((r*(*h0)).transpose(), dActdn * dLdn) ); // [nU,nU] + // dLdWrh = dLdh * dhdWrh = (dLdh * dhdc) * dcdWrh = dLdc * dcdZc * dZcdWrh = dLdc * dcdZc * dZcdr * drdWrh = + // = dLdc * dcdZc * dZcdr * drdZr * dZrdWrh = dLdr * drdZr * dZrdWrh + // dZrdWrh = hLastT × ... + // finally dLdWrh = hLastT × (dLdr * drdZr) - dLdbr.assign( (dSigdr * dLdr).reduceAlongDims(reduce::Sum, {0})); // [nU] - dLdbu.assign( (dSigdu * dLdu).reduceAlongDims(reduce::Sum, {0})); // [nU] - dLdbn.assign( (dActdn * dLdn).reduceAlongDims(reduce::Sum, {0})); // [nU] - if(dLdWx0 != nullptr) - *dLdWx += *dLdWx0; + // dLdWux = dLdh * dhdWux = (dLdh * dhdu) * dudWux = dLdu * dudZu * dZudWux + // dZudWux = xT × ... + // finally dLdWux = xT × (dLdu * dudZu) - if(dLdWh0 != nullptr) - *dLdWh += *dLdWh0; - if(dLdb0 != nullptr) - *dLdb += *dLdb0; + // dLdWuh = dLdh * dhdWuh = (dLdh * dhdu) * dudWuh = dLdh * dhdu * dudZu * dZudWuh = dLdu * dudZu * dZudWuh + // dZudWuh = hLastT × ... + // finally dLdWuh = hLastT × (dLdu * dudZu) + + // dLdWcx = dLdh * dhdWcx = dLdh * dhdc * dcdWcx = (dLdh * dhdc) * dcdZc * dZcdWcx = dLdc * dcdZc * dZcdWcx + // dZcdWcx = xT × ... + // finally dLdWcx = xT × (dLdc * dcdZc) + + + // dLdWch = dLdh * dhdWch = dLdh * dhdc * dcdWch = (dLdh * dhdc) * dcdZc * dZcdWch = dLdc * dcdZc * dZcdWch + // dZcdWch = (r*hLast)^T × ... + // finally dLdWch = (r*hLast)^T × (dLdc * dcdZc) + + + // dLdbr = dLdh * dhdbr = (dLdh * dhdc) * dcdbr = dLdc * dcdbr = dLdc * dcdZc * dZcdbr = dLdc * dcdZc * dZcdr * drdbr = + // = dLdr * drdZr * dZrdbr + // dZrdbr = 1 + // finally dLdbr = dLdr * drdZr + + + // dLdbu = dLdh * dhdbu = (dLdh * dhdu) * dudbu = dLdu * dudZu * dZudbu + // dZudbu = 1 + // finally dLdbu = dLdu * dudZu + + + // dLdbc = dLdh * dhdbc = (dLdh * dhdc) * dcdbc = dLdc * dcdZc * dZcdbc + // dZcdbc = 1 + // finally dLdbc = dLdc * dcdZc +
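+ + // Editorial sketch (assumption, not part of this patch): the closed forms above can be sanity-checked with central finite differences around gruCell, e.g. for any scalar entry w of W: + // double eps = 1e-4; // hypothetical check + // w += eps; /* run the cell */ double lossP = L(h); + // w -= 2 * eps; /* run the cell */ double lossM = L(h); + // double dLdw_numeric = (lossP - lossM) / (2 * eps); // compare with dLdW above + // where L is any scalar loss over the cell outputs.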
+ NDArray dhdc = 1.f - u; // [bS, nU] + NDArray dhdu = *hLast - c; // [bS, nU] + NDArray dudZu = u * dhdc; // [bS, nU], i.e. u * (1 - u), since dhdc = 1 - u + NDArray drdZr = r * (1.f - r); // [bS, nU] + NDArray dcdZc = 1.f - c * c; // [bS, nU] + NDArray dLdZc = *dLdc * dcdZc; // [bS, nU] + NDArray dLdZu = *dLdu * dudZu; // [bS, nU] + NDArray dLdZr = *dLdr * drdZr; // [bS, nU] + + // NDArray dLdc = *dLdh * dhdc; // [bS, nU] + // NDArray dLdu = *dLdh * dhdu; // [bS, nU] + // NDArray dLdr = mmul(dLdc * dcdZc * *hLast, WchT); // [bS, nU] + + dLdx->assign(mmul(dLdZu, WuxT) + mmul(dLdZc, WcxT) + mmul(dLdZr, WrxT)); // [bS, iS] + + dLdhLast->assign(*dLdh * u + mmul(dLdZu, WuhT) + mmul(dLdZc * r, WchT) + mmul(dLdZr, WrhT)); // [bS, nU] + + dLdWrx.assign(mmul(xT, dLdZr)); // [iS, bS] × [bS, nU] = [iS, nU] + dLdWrh.assign(mmul(hLastT, dLdZr)); // [nU, bS] × [bS, nU] = [nU, nU] + dLdWux.assign(mmul(xT, dLdZu)); // [iS, bS] × [bS, nU] = [iS, nU] + dLdWuh.assign(mmul(hLastT, dLdZu)); // [nU, bS] × [bS, nU] = [nU, nU] + + dLdWcx.assign(mmul(xT, dLdZc)); // [iS, bS] × [bS, nU] = [iS, nU] + dLdWch.assign(mmul((r * *hLast).transpose(), dLdZc)); // [nU, bS] × [bS, nU] = [nU, nU] + + dLdbr.assign(dLdZr.reduceAlongDims(reduce::Sum, {0})); // [nU] + dLdbu.assign(dLdZu.reduceAlongDims(reduce::Sum, {0})); // [nU] + + dLdbc->assign(dLdZc.reduceAlongDims(reduce::Sum, {0})); // [nU] } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu index 8eacc0c38..54abc7d71 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/hashcode.cu @@ -20,12 +20,111 @@ #include + namespace nd4j { namespace ops { namespace helpers { - void hashCode(LaunchContext *context, NDArray &array, NDArray &result) { + template + static __global__ void splitBufferToChunks(T* buffer, Nd4jLong* tempBuffer, Nd4jLong numBlocks, Nd4jLong blockSize, Nd4jLong length) { + + for (int b = blockIdx.x; b < numBlocks; b += gridDim.x) { + auto blockBuffer = buffer + b * numBlocks; + + Nd4jLong r = 1; + for (int e = threadIdx.x; e < blockSize && e + (b * numBlocks) < length; e += blockDim.x) { + auto v = longBytes(blockBuffer[e]); + r = 31 * r + v; + } + + tempBuffer[b] = r; + } + } + + template + static __global__ void internalHash(Nd4jLong* tempBuffer, Nd4jLong* tempResult, Nd4jLong numBlocks, Nd4jLong blockSize, Nd4jLong lastLength) { + + for (int b = blockIdx.x; b < numBlocks; b += gridDim.x) { + auto blockBuffer = tempBuffer + b * numBlocks; + + Nd4jLong r = 1; + for (int e = threadIdx.x; e < blockSize && e + (b * numBlocks) < lastLength; e += blockDim.x) { + auto v = longBytes(blockBuffer[e]); + r = 31 * r + v; + } + + tempResult[b] = r; + } } + +
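+ + // Editorial sketch (illustrative, not part of this patch): the per-chunk reduction both kernels perform is the classic 31-based rolling hash; sequentially it is just + // Nd4jLong r = 1; + // for (auto e = 0; e < chunkLength; e++) + // r = 31 * r + longBytes(blockBuffer[e]); + // internalHash then re-hashes the per-chunk results until a single value is left, i.e. a hierarchical variant of the sequential hash rather than a byte-identical one.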
+ static __global__ void lastStep(Nd4jLong* resultBuf, Nd4jLong* tempBufferA, Nd4jLong* tempResult, Nd4jLong length, Nd4jLong blockSize) { + if (threadIdx.x == 0) { + + if (length <= blockSize) + *resultBuf = *tempBufferA; + else + *resultBuf = *tempResult; + } + } + + template + void hashCode_(LaunchContext *context, NDArray &array, NDArray &result) { + auto blockSize = 32; + auto stream = context->getCudaStream(); + array.syncToDevice(); + + NDArray::prepareSpecialUse({&result}, {&array}); + auto length = array.lengthOf(); + int numBlocks = length / blockSize + ((length % blockSize == 0) ? 0 : 1); + auto tempA = NDArrayFactory::create('c', {numBlocks}, context); + auto tempB = NDArrayFactory::create('c', { numBlocks / blockSize + 1}, context); + + auto buffer = reinterpret_cast(array.specialBuffer()); //bufferAsT(); + auto tempBufferA = reinterpret_cast(tempA.specialBuffer()); //bufferAsT(); + auto tempBufferB = reinterpret_cast(tempB.specialBuffer()); //bufferAsT(); + + // the first buffer is the default one, since for small arrays (length <= blockSize) it already holds the final result + auto tempBuffer = tempBufferA; + auto tempResult = tempBufferB; + + // we divide the array into 32-element chunks and store the per-chunk hashes once + splitBufferToChunks<<>>(buffer, tempBuffer, numBlocks, blockSize, length); + + // then we keep hashing the intermediate buffer until only one chunk is left + int iterationCount = 0; + while (numBlocks > 1) { + int lastLength = numBlocks; + numBlocks = lastLength / blockSize + ((lastLength % blockSize == 0) ? 0 : 1); + + + internalHash<<>>(tempBuffer, tempResult, numBlocks, blockSize, lastLength); + + + iterationCount++; + // swapping buffers + if (iterationCount % 2 == 0) { + tempBuffer = tempBufferA; + tempResult = tempBufferB; + } else { + tempBuffer = tempBufferB; + tempResult = tempBufferA; + } + } + + //lastStep<<<1,1,128, *stream>>>(result.specialBuffer(), tempBufferA, tempResult, length, blockSize); + tempA.syncToHost(); + tempB.syncToHost(); + result.assign((length <= blockSize ? tempA.e(0) : tempB.e(0))); + + NDArray::registerSpecialUse({&result}, {&array}); + } + + void hashCode(LaunchContext *context, NDArray &array, NDArray &result) { + BUILD_SINGLE_SELECTOR(array.dataType(), hashCode_, (context, array, result), LIBND4J_TYPES); + } + + BUILD_SINGLE_TEMPLATE(template void hashCode_, (LaunchContext* context, NDArray& array, NDArray& result), LIBND4J_TYPES); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu index f221f4771..2faf44106 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu @@ -20,6 +20,8 @@ #include #include +#include +#include namespace nd4j { namespace ops { @@ -35,15 +37,16 @@ namespace helpers { Nd4jLong next1[] = {nextIndex, 1}; Nd4jLong next2[] = {nextIndex, 2}; Nd4jLong next3[] = {nextIndex, 3}; - - T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), previous0, 2)], boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), previous2, 2)]); - T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), previous1, 2)], boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), previous3, 2)]); - T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), previous0, 2)], boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape),
previous2, 2)]); - T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), previous1, 2)], boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), previous3, 2)]); - T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), next0, 2)], boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), next2, 2)]); - T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), next1, 2)], boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), next3, 2)]); - T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), next0, 2)], boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), next2, 2)]); - T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), next1, 2)], boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), next3, 2)]); + Nd4jLong* shapeOf = shape::shapeOf(boxesShape); + Nd4jLong* strideOf = shape::stride(boxesShape); + T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, previous0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous2, 2)]); + T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, previous1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous3, 2)]); + T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, previous0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous2, 2)]); + T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, previous1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous3, 2)]); + T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, next0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next2, 2)]); + T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, next1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next3, 2)]); + T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, next0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next2, 2)]); + T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, next1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next3, 2)]); T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev); T areaNext = (maxYNext - minYNext) * (maxXNext - minXNext); @@ -62,149 +65,101 @@ namespace helpers { }; template - static __global__ void nonMaxSuppressionKernel(T* boxes, Nd4jLong* boxesShape, I* indices, int* selectedIndices, Nd4jLong numBoxes, I* output, Nd4jLong* outputShape, T threshold) { - __shared__ Nd4jLong outputLen; - + static __global__ void shouldSelectKernel(T* boxesBuf, Nd4jLong* boxesShape, I* indexBuf, I* selectedIndicesData, double threshold, int numSelected, int i, bool* shouldSelect) { + auto tid = blockIdx.x * blockDim.x + threadIdx.x; + auto step = gridDim.x * blockDim.x; + __shared__ bool shouldSelectShared; if (threadIdx.x == 0) { - outputLen = shape::length(outputShape); + shouldSelectShared = shouldSelect[0]; } __syncthreads(); - - auto numSelected = blockIdx.x; - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = blockDim.x * gridDim.x; -// for (int numSelected = blockIdx.x; numSelected < outputLen; numSelected += gridDim.x) { - for (int i = start; i < 
numBoxes; i += step) { - bool shouldSelect = true; - for (int j = numSelected - 1; shouldSelect && j >= 0; --j) { - if (needToSuppressWithThreshold(boxes, boxesShape, indices[i], indices[selectedIndices[j]], threshold)) { - shouldSelect = false; - } - } - - if (shouldSelect) { - auto zPos = shape::getIndexOffset(numSelected, outputShape, outputLen); - output[zPos] = indices[i]; - selectedIndices[numSelected] = i; - } - + for (int j = numSelected - 1 - tid; j >= 0; j -= step) { + if (shouldSelectShared) { + if (needToSuppressWithThreshold(boxesBuf, boxesShape, indexBuf[i], + indexBuf[selectedIndicesData[j]], T(threshold))) + shouldSelectShared = false; + } + } + __syncthreads(); + if (threadIdx.x == 0) { + *shouldSelect = shouldSelectShared; } } + template - template - static __global__ void sortIndices(I* indices, Nd4jLong* indexShape, T* scores, Nd4jLong* scoreShape) { - __shared__ Nd4jLong len; -// __shared__ Nd4jLong* sortedPart; -// __shared__ Nd4jLong part; -// __shared__ Nd4jLong partSize; - + static __global__ void copyIndices(void* indices, void* indicesLong, Nd4jLong len) { + __shared__ I* indexBuf; + __shared__ Nd4jLong* srcBuf; if (threadIdx.x == 0) { -// blocksPerArr = (gridDim.x + numOfArrs - 1) / numOfArrs; // ceil -// part = blockIdx.x / blocksPerArr; - - len = shape::length(indexShape); -// __shared__ Nd4jLong* shmem = shared[]; -// sortedPart = shmem; + indexBuf = reinterpret_cast(indices); + srcBuf = reinterpret_cast(indicesLong); } + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + auto step = blockDim.x * gridDim.x; - for (int m = 0; m < len; m++) { - if (m % 2 == 0) { - for (int tid = threadIdx.x; tid < len; tid += blockDim.x) { - auto top = 2 * tid + 1; - if (top < len) { - auto t0 = shape::getIndexOffset(top - 1, indexShape, len); - auto t1 = shape::getIndexOffset(top, indexShape, len); - auto z0 = shape::getIndexOffset(top - 1, scoreShape, len); - auto z1 = shape::getIndexOffset(top, scoreShape, len); - - if (scores[t0] < scores[t1]) { - // swap indices first - Nd4jLong di0 = indices[t0]; - indices[t0] = indices[t1]; - indices[t1] = di0; - - //swap scores next -// T dz0 = scores[z0]; -// scores[z0] = scores[z1]; -// scores[z1] = dz0; - } - } - } - } else { - for (int tid = threadIdx.x; tid < len; tid += blockDim.x) { - auto top = 2 * tid + 2; - if (top < len) { - auto t0 = shape::getIndexOffset(top - 1, indexShape, len); - auto t1 = shape::getIndexOffset(top, indexShape, len); - auto z0 = shape::getIndexOffset(top - 1, scoreShape, len); - auto z1 = shape::getIndexOffset(top, scoreShape, len); - - if (scores[t0] < scores[t1]) { - // swap indices first - Nd4jLong di0 = indices[t0]; - indices[t0] = indices[t1]; - indices[t1] = di0; - - //swap scores next -// T dz0 = scores[z0]; -// scores[z0] = scores[z1]; -// scores[z1] = dz0; - } - } - } - } - __syncthreads(); - } + for (auto i = tid; i < len; i += step) + indexBuf[i] = (I)srcBuf[i]; } template static void nonMaxSuppressionV2_(nd4j::LaunchContext* context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) { auto stream = context->getCudaStream(); NDArray::prepareSpecialUse({output}, {boxes, scales}); - NDArray* indices = NDArrayFactory::create_('c', {scales->lengthOf()}); // - 1, scales->lengthOf()); //, scales->getContext()); + std::unique_ptr indices(NDArrayFactory::create_('c', {scales->lengthOf()})); // - 1, scales->lengthOf()); //, scales->getContext()); indices->linspace(0); + indices->syncToDevice(); // linspace only on CPU, so sync to Device as well + NDArray scores(*scales); 
- indices->syncToHost(); //linspace(0); - I* indexBuf = reinterpret_cast(indices->specialBuffer()); - T* scoreBuf = reinterpret_cast(scores.specialBuffer()); - sortIndices<<<1, 32, 128, *stream>>>(indexBuf, indices->specialShapeInfo(), scoreBuf, scores.specialShapeInfo()); + NativeOps nativeOps; + + Nd4jPointer extras[2] = {nullptr, stream}; + + nativeOps.sortByValue(extras, indices->buffer(), indices->shapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), scores.buffer(), scores.shapeInfo(), scores.specialBuffer(), scores.specialShapeInfo(), true); // TO DO: sort indices using scales as value row //std::sort(indices.begin(), indices.end(), [scales](int i, int j) {return scales->e(i) > scales->e(j);}); - indices->tickWriteDevice(); - indices->syncToHost(); - indices->printIndexedBuffer("AFTERSORT OUTPUT"); - NDArray selected = NDArrayFactory::create({output->lengthOf()}); + I* indexBuf = reinterpret_cast(indices->specialBuffer()); - NDArray selectedIndices = NDArrayFactory::create({output->lengthOf()}); + NDArray selectedIndices = NDArrayFactory::create('c', {output->lengthOf()}); int numSelected = 0; int numBoxes = boxes->sizeAt(0); T* boxesBuf = reinterpret_cast(boxes->specialBuffer()); -// Nd4jLong* indicesData = reinterpret_cast(indices->specialBuffer()); -// int* selectedData = reinterpret_cast(selected.specialBuffer()); - int* selectedIndicesData = reinterpret_cast(selectedIndices.specialBuffer()); + + I* selectedIndicesData = reinterpret_cast(selectedIndices.specialBuffer()); I* outputBuf = reinterpret_cast(output->specialBuffer()); - nonMaxSuppressionKernel<<lengthOf(), 512, 1024, *stream>>>(boxesBuf, boxes->specialShapeInfo(), indexBuf, selectedIndicesData, numBoxes, outputBuf, output->specialShapeInfo(), T(threshold)); - NDArray::registerSpecialUse({output}, {boxes, scales}); -// for (int i = 0; i < boxes->sizeAt(0); ++i) { -// if (selected.size() >= output->lengthOf()) break; -// bool shouldSelect = true; -// // Overlapping boxes are likely to have similar scores, -// // therefore we iterate through the selected boxes backwards. 
-// for (int j = numSelected - 1; j >= 0; --j) { -// if (needToSuppressWithThreshold(*boxes, indices[i], indices[selectedIndices[j]], T(threshold)) { -// shouldSelect = false; -// break; -// } -// } -// if (shouldSelect) { -// selected.push_back(indices[i]); -// selectedIndices[numSelected++] = i; -// } -// } -// for (size_t e = 0; e < selected.size(); ++e) -// output->p(e, selected[e]); -// - delete indices; + + bool* shouldSelectD; + auto err = cudaMalloc(&shouldSelectD, sizeof(bool)); + if (err) { + throw cuda_exception::build("helpers::nonMaxSuppressionV2: Cannot allocate memory for bool flag", err); + } + for (I i = 0; i < boxes->sizeAt(0); ++i) { + bool shouldSelect = numSelected < output->lengthOf(); + if (shouldSelect) { + err = cudaMemcpy(shouldSelectD, &shouldSelect, sizeof(bool), cudaMemcpyHostToDevice); + if (err) { + throw cuda_exception::build("helpers::nonMaxSuppressionV2: Cannot copy bool flag to device", err); + } + + shouldSelectKernel<<<128, 256, 1024, *stream>>> + (boxesBuf, boxes->specialShapeInfo(), indexBuf, selectedIndicesData, threshold, numSelected, i, shouldSelectD); + err = cudaMemcpy(&shouldSelect, shouldSelectD, sizeof(bool), cudaMemcpyDeviceToHost); + if (err) { + throw cuda_exception::build("helpers::nonMaxSuppressionV2: Cannot copy bool flag back to host", err); + } + } + + if (shouldSelect) { + cudaMemcpy(reinterpret_cast(output->specialBuffer()) + numSelected, indexBuf + i, sizeof(I), cudaMemcpyDeviceToDevice); + cudaMemcpy(selectedIndicesData + numSelected, &i, sizeof(I), cudaMemcpyHostToDevice); + numSelected++; + } + } + + err = cudaFree(shouldSelectD); + if (err) { + throw cuda_exception::build("helpers::nonMaxSuppressionV2: Cannot deallocate memory for bool flag", err); + } + } void nonMaxSuppressionV2(nd4j::LaunchContext * context, NDArray* boxes, NDArray* scales, int maxSize, double threshold, NDArray* output) { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu index 49028784a..79c290c28 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu @@ -32,24 +32,24 @@ namespace nd4j { namespace ops { namespace helpers { - template - static __device__ void swapRows_(T* matrix, Nd4jLong* shape, int theFirst, int theSecond, Nd4jLong N) { - if (theFirst != theSecond) { - auto start = threadIdx.x + blockIdx.x * blockDim.x; - auto step = blockDim.x * gridDim.x; - for (auto i = start; i < N; i += step) { - Nd4jLong iCoord1[] = {theFirst, i}; - Nd4jLong iCoord2[] = {theSecond, i}; - auto iIndex1 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord1, 2); - auto iIndex2 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord2, 2); - //atomicExch(&matrix[iIndex1], matrix[iIndex2]); - T e0 = matrix[iIndex1]; - T e1 = matrix[iIndex2]; - matrix[iIndex1] = e0; - matrix[iIndex2] = e1; - } - } - } +// template +// static __device__ void swapRows_(T* matrix, Nd4jLong* shape, int theFirst, int theSecond, Nd4jLong N) { +// if (theFirst != theSecond) { +// auto start = threadIdx.x + blockIdx.x * blockDim.x; +// auto step = blockDim.x * gridDim.x; +// for (auto i = start; i < N; i += step) { +// Nd4jLong iCoord1[] = {theFirst, i}; +// Nd4jLong iCoord2[] = {theSecond, i}; +// auto iIndex1 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord1, 2); +// auto iIndex2 = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), iCoord2, 2); +// //atomicExch(&matrix[iIndex1],
matrix[iIndex2]); +// T e0 = matrix[iIndex1]; +// T e1 = matrix[iIndex2]; +// matrix[iIndex1] = e0; +// matrix[iIndex2] = e1; +// } +// } +// } // BUILD_SINGLE_TEMPLATE(template void swapRows_, (NDArray* matrix, int theFirst, int theSecond), FLOAT_TYPES); // // void swapRows(NDArray* matrix, int theFirst, int theSecond) { @@ -71,9 +71,14 @@ namespace helpers { for (int i = start + 1; i < n; i += step) { Nd4jLong pos[] = {i, i - 1}; + Nd4jLong posX[] = {i, i}; + Nd4jLong posY[] = {i - 1, i - 1}; auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); + auto dxIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2); + auto dyIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2); auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); - inverted[zIndex] = -input[xIndex]; + inverted[zIndex] = -input[xIndex] / (input[dxIndex] * input[dyIndex]); +// math::atomics::nd4j_atomicAdd(&inverted[zIndex], - input[xIndex] * inverted[iIndex] / input[dIndex]); } } @@ -91,10 +96,11 @@ namespace helpers { auto start = threadIdx.x + blockIdx.x * blockDim.x; auto step = blockDim.x * gridDim.x; - for (int i = start + 1; i < n; i += step) { + for (int i = start; i < n; i += step) { Nd4jLong pos[] = {i, i}; auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); +// math::atomics::nd4j_atomicDiv(&inverted[zIndex], input[xIndex]); inverted[zIndex] /= input[xIndex]; } } @@ -113,16 +119,16 @@ namespace helpers { auto start = threadIdx.x + blockIdx.x * blockDim.x; auto step = blockDim.x * gridDim.x; - for (int i = start + 1; i < n - 1; i += step) { + for (int i = start; i < n - 1; i += step) { Nd4jLong pos[] = {i, i + 1}; - Nd4jLong posY[] = {i, i}; + //Nd4jLong posY[] = {i, i}; Nd4jLong posX[] = {i + 1, i + 1}; auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); - auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); +// auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2); // auto yIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); auto iIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posX, 2); auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); - inverted[zIndex] -= input[xIndex] * inverted[iIndex] / input[yIndex]; + math::atomics::nd4j_atomicAdd(&inverted[zIndex], - input[xIndex] * inverted[iIndex]); // / input[yIndex]); //inputMatrix->t(i, i + 1) * invertedMatrix->t(i + 1, i + 1) / inputMatrix->t(i, i) } } @@ -142,16 +148,18 @@ namespace helpers { // auto step = blockDim.x * gridDim.x; for (int i = blockIdx.x + 2; i < n; i += gridDim.x) { - for (int j = i - 2; j > -1; --j) - for (int k = threadIdx.x; k < i; k+= blockDim.x) { + for (int j = i - 2; j >= 0; --j) + for (int k = threadIdx.x; k < i; k += blockDim.x) { Nd4jLong posZ[] = {i, j}; - Nd4jLong posX[] = {k, j}; - Nd4jLong posY[] = {i, k}; + Nd4jLong posY[] = {k, j}; + Nd4jLong posX[] = {i, k}; + Nd4jLong posD[] = {i, i}; auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2); auto yIndex = shape::getOffset(0, shape::shapeOf(invertedShape), 
shape::stride(invertedShape), posY, 2); + auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2); auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posZ, 2); - inverted[zIndex] -= inverted[yIndex] * input[xIndex]; + math::atomics::nd4j_atomicAdd(&inverted[zIndex], - inverted[yIndex] * input[xIndex] / input[dIndex]); } } } @@ -176,13 +184,13 @@ namespace helpers { Nd4jLong posZ[] = {i, j}; Nd4jLong posY[] = {k, j}; Nd4jLong posX[] = {i, k}; - Nd4jLong posD[] = {i, i}; +// Nd4jLong posD[] = {i, i}; auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2); auto yIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posY, 2); - auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2); + // auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2); auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posZ, 2); - inverted[zIndex] -= inverted[yIndex] * input[xIndex] / input[dIndex]; + math::atomics::nd4j_atomicAdd(&inverted[zIndex], - inverted[yIndex] * input[xIndex]);// / input[dIndex]); } } } @@ -196,14 +204,18 @@ namespace helpers { LaunchContext* context = inputMatrix->getContext(); auto stream = context->getCudaStream(); + // invert main diagonal + upvertKernel<<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); + // invert the second diagonal invertKernelLow<<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); +// invertKernelLow<<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); invertLowKernel<<>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); } - BUILD_SINGLE_TEMPLATE(template void invertLowerMatrix_, (NDArray* inputMatrix, NDArray* invertedMatrix);, FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void invertLowerMatrix_, (NDArray* inputMatrix, NDArray* invertedMatrix);, FLOAT_NATIVE); void invertLowerMatrix(NDArray* inputMatrix, NDArray* invertedMatrix) { - BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (inputMatrix, invertedMatrix), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertLowerMatrix_, (inputMatrix, invertedMatrix), FLOAT_NATIVE); } template @@ -215,58 +227,58 @@ namespace helpers { return; } - upvertKernel<<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); + //upvertKernel<<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); upvertKernelUp<<<1, n, 128, *stream>>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); invertUpKernel<<>>(invertedMatrix->specialBuffer(), invertedMatrix->specialShapeInfo(), inputMatrix->specialBuffer(), inputMatrix->specialShapeInfo(), n); } - BUILD_SINGLE_TEMPLATE(template void invertUpperMatrix_, (NDArray* inputMatrix, NDArray* 
invertedMatrix);, FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void invertUpperMatrix_, (NDArray* inputMatrix, NDArray* invertedMatrix);, FLOAT_NATIVE); void invertUpperMatrix(NDArray* inputMatrix, NDArray* invertedMatrix) { - BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertUpperMatrix_, (inputMatrix, invertedMatrix), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(inputMatrix->dataType(), invertUpperMatrix_, (inputMatrix, invertedMatrix), FLOAT_NATIVE); } - template - static __global__ void lupKernel(T* compound, Nd4jLong* compoundShape, T* permutation, Nd4jLong* permutationShape, Nd4jLong rowNum) { - int swapCount = 0; - for(int i = blockIdx.x; i < rowNum; i += gridDim.x ) { - auto pivotValue = T(0.0); - auto pivot = -1; - - for(int rowCounter = i; rowCounter < rowNum; rowCounter++ ) { - Nd4jLong rowCoord[] = {rowCounter, i}; - auto rowPos = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), rowCoord, 2); - if(nd4j::math::nd4j_abs(compound[rowPos]) > pivotValue ) { - pivotValue = nd4j::math::nd4j_abs(compound[rowPos]); - pivot = rowCounter; - } - } - - if( pivotValue != T(0.0) ) { - swapRows_(compound, compoundShape, pivot, i, rowNum); - swapRows_(permutation, permutationShape, pivot, i, rowNum); - if (pivot != i) - swapCount++; - - for( int j = i + 1; j < rowNum; j++ ) { - Nd4jLong posJIbuf[] = {j, i}; - Nd4jLong posIIbuf[] = {i, i}; - auto posJI = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJIbuf, 2); - auto posII = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIIbuf, 2); - - compound[posJI] /= compound[posII]; - for( int k = i + 1; k < rowNum; k++ ) { - Nd4jLong posJKbuf[] = {j, k}; - Nd4jLong posIKbuf[] = {i, k}; - auto posJK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJKbuf, 2); - auto posIK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIKbuf, 2); - T arg = compound[posJI] * compound[posIK]; - compound[posJK] -= arg; - } - } - } - } - } +// template +// static __global__ void lupKernel(T* compound, Nd4jLong* compoundShape, T* permutation, Nd4jLong* permutationShape, Nd4jLong rowNum) { +// int swapCount = 0; +// for(int i = blockIdx.x; i < rowNum; i += gridDim.x ) { +// auto pivotValue = T(0.0); +// auto pivot = -1; +// +// for(int rowCounter = i; rowCounter < rowNum; rowCounter++ ) { +// Nd4jLong rowCoord[] = {rowCounter, i}; +// auto rowPos = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), rowCoord, 2); +// if(nd4j::math::nd4j_abs(compound[rowPos]) > pivotValue ) { +// pivotValue = nd4j::math::nd4j_abs(compound[rowPos]); +// pivot = rowCounter; +// } +// } +// +// if( pivotValue != T(0.0) ) { +// swapRows_(compound, compoundShape, pivot, i, rowNum); +// swapRows_(permutation, permutationShape, pivot, i, rowNum); +// if (pivot != i) +// swapCount++; +// +// for( int j = i + 1; j < rowNum; j++ ) { +// Nd4jLong posJIbuf[] = {j, i}; +// Nd4jLong posIIbuf[] = {i, i}; +// auto posJI = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJIbuf, 2); +// auto posII = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIIbuf, 2); +// +// compound[posJI] /= compound[posII]; +// for( int k = i + 1; k < rowNum; k++ ) { +// Nd4jLong posJKbuf[] = {j, k}; +// Nd4jLong posIKbuf[] = {i, k}; +// auto posJK = shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posJKbuf, 2); +// auto posIK = 
shape::getOffset(0, shape::shapeOf(compoundShape), shape::stride(compoundShape), posIKbuf, 2); +// T arg = compound[posJI] * compound[posIK]; +// compound[posJK] -= arg; +// } +// } +// } +// } +// } template static __global__ void determinantKernel(T* compound, T* result, Nd4jLong len) { @@ -332,6 +344,30 @@ namespace helpers { matrix[j] = (F)inputBuf[xIndex]; } } + + template + static __global__ void returnMatrix(void* output, Nd4jLong* outputShape, void* input, Nd4jLong* inputShape, Nd4jLong pos, Nd4jLong rowLen) { + __shared__ F* matrix; + __shared__ T* outputBuf; + __shared__ Nd4jLong outputLen; + __shared__ Nd4jLong n2; + + if (threadIdx.x == 0) { + matrix = reinterpret_cast(input); + outputBuf = reinterpret_cast(output); + outputLen = shape::length(inputShape); + n2 = rowLen * rowLen; + } + __syncthreads(); + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = blockDim.x * gridDim.x; + + for (int k = pos + start, j = start; j < n2; k += step, j += step) { + auto zIndex = shape::getIndexOffset(k, outputShape, outputLen); + outputBuf[zIndex] = (T)matrix[j]; + } + } + template static __global__ void fillUpPermutation(void* output, Nd4jLong* shape, int* source, int rowNum) { __shared__ F* permutation; @@ -462,7 +498,7 @@ namespace helpers { d_work, permutationBuf, d_info); - fillUpPermutation<<>>(permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n); + fillUpPermutation<<>>(permutation->specialBuffer(), permutation->specialShapeInfo(), permutationBuf, n); permutation->tickWriteDevice(); } err = cudaFree(d_work); @@ -483,7 +519,7 @@ namespace helpers { // NDArray::registerSpecialUse({input}, {input}); input->tickWriteDevice(); } - BUILD_SINGLE_TEMPLATE(template void lup_, (LaunchContext* context, NDArray* input, NDArray* output, NDArray* permutation), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template void lup_, (LaunchContext* context, NDArray* input, NDArray* output, NDArray* permutation), FLOAT_NATIVE); template static int determinant_(nd4j::LaunchContext* context, NDArray* input, NDArray* output) { @@ -504,32 +540,32 @@ namespace helpers { output->assign(1.f); for (int e = 0; e < output->lengthOf(); e++) { Nd4jLong pos = e * n2; - if (matrix.dataType() == input->dataType()) +// if (matrix.dataType() == input->dataType()) fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); - else - fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); +// else +// fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); - if (matrix.dataType() == input->dataType()) - lup_(context, &matrix, nullptr, nullptr); - else - lup_(context, &matrix, nullptr, nullptr); +// if (matrix.dataType() == input->dataType()) + lup_(context, &matrix, nullptr, nullptr); +// else +// lup_(context, &matrix, nullptr, nullptr); auto offset = shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf()); auto inputBuf = reinterpret_cast(matrix.specialBuffer()); auto outputBuf = reinterpret_cast(output->specialBuffer()) + offset; - if (matrix.dataType() == input->dataType()) - determinantKernel<<>> (inputBuf, outputBuf, n); - else - determinantKernel<<>> (inputBuf, outputBuf, n); +// if (matrix.dataType() == input->dataType()) + determinantKernel<<>> (inputBuf, outputBuf, n); +// else +// determinantKernel<<>> (inputBuf, outputBuf, n); } NDArray::registerSpecialUse({output}, 
{input}); return Status::OK(); } - BUILD_SINGLE_TEMPLATE(template int determinant_, (nd4j::LaunchContext* context, NDArray* input, NDArray* output), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template int determinant_, (nd4j::LaunchContext* context, NDArray* input, NDArray* output), FLOAT_NATIVE); int determinant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { - BUILD_SINGLE_SELECTOR(input->dataType(), return determinant_, (context, input, output), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(input->dataType(), return determinant_, (context, input, output), FLOAT_NATIVE); } template @@ -552,22 +588,22 @@ namespace helpers { output->assign(1.f); for (int e = 0; e < output->lengthOf(); e++) { Nd4jLong pos = e * n2; - if (matrix.dataType() == input->dataType()) - fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); - else - fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); +// if (matrix.dataType() == input->dataType()) + fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); +// else +// fillMatrix<<>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), pos, n); - if (matrix.dataType() == input->dataType()) +// if (matrix.dataType() == input->dataType()) lup_(context, &matrix, nullptr, nullptr); - else - lup_(context, &matrix, nullptr, nullptr); +// else +// lup_(context, &matrix, nullptr, nullptr); auto offset = shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf()); auto inputBuf = reinterpret_cast(matrix.specialBuffer()); auto outputBuf = reinterpret_cast(output->specialBuffer()) + offset; - if (matrix.dataType() == input->dataType()) +// if (matrix.dataType() == input->dataType()) determinantLogKernel<<>> (inputBuf, outputBuf, n); - else - determinantLogKernel<<>> (inputBuf, outputBuf, n); +// else +// determinantLogKernel<<>> (inputBuf, outputBuf, n); } NDArray::registerSpecialUse({output}, {input}); @@ -576,10 +612,10 @@ namespace helpers { return ND4J_STATUS_OK; } - BUILD_SINGLE_TEMPLATE(template int logAbsDeterminant_, (LaunchContext* context, NDArray* input, NDArray* output), FLOAT_TYPES); + BUILD_SINGLE_TEMPLATE(template int logAbsDeterminant_, (LaunchContext* context, NDArray* input, NDArray* output), FLOAT_NATIVE); int logAbsDeterminant(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { - BUILD_SINGLE_SELECTOR(input->dataType(), return logAbsDeterminant_, (context, input, output), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(input->dataType(), return logAbsDeterminant_, (context, input, output), FLOAT_NATIVE); } template @@ -597,10 +633,12 @@ namespace helpers { if (threadIdx.x == 0) { xShapeOf = shape::shapeOf(lowerShape); - yShapeOf = shape::shapeOf(upperShape); - zShapeOf = shape::shapeOf(matrixShape); xStrideOf = shape::stride(lowerShape); + + yShapeOf = shape::shapeOf(upperShape); yStrideOf = shape::stride(upperShape); + + zShapeOf = shape::shapeOf(matrixShape); zStrideOf = shape::stride(matrixShape); lowerMatrix = reinterpret_cast(lowerBuf); upperMatrix = reinterpret_cast(upperBuf); @@ -610,15 +648,16 @@ namespace helpers { for (int k = blockIdx.x; k < n; k += gridDim.x) { // and then put all values under main diagonal on to it for (int j = threadIdx.x; j < n; j += blockDim.x) { - Nd4jLong posX[] = {j, k}; - + Nd4jLong posX[] = {k, j}; + Nd4jLong posD[] = {j, j}; auto xPos = 
shape::getOffset(0, xShapeOf, xStrideOf, posX, 2); auto yPos = shape::getOffset(0, yShapeOf, yStrideOf, posX, 2); - auto pos = shape::getOffset(0, zShapeOf, zStrideOf, posX, 2); - if (k <= j) - lowerMatrix[xPos] = matrix[pos];//(k, j); + auto iPos = shape::getOffset(0, zShapeOf, zStrideOf, posX, 2); + auto dPos = shape::getOffset(0, zShapeOf, zStrideOf, posD, 2); + if (k >= j) + lowerMatrix[xPos] = matrix[iPos];//(k, j); else - upperMatrix[yPos] = matrix[pos]; //k, j); + upperMatrix[yPos] = matrix[iPos]; //k, j); } } } @@ -639,38 +678,26 @@ namespace helpers { auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), {output->rankOf() - 2, output->rankOf() - 1}); auto stream = context->getCudaStream(); +// PRAGMA_OMP_PARALLEL_FOR for (auto i = 0LL; i < packX.numberOfTads(); i++) { - fillMatrix<<<1, n2, 128, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), i * n2, n); - permutation.assign(0.f); - lup_(context, &matrix, &compound, &permutation); + fillMatrix<<<1, n2, 128, *stream>>>(matrix.specialBuffer(), matrix.specialShapeInfo(), input->specialBuffer(), input->specialShapeInfo(), i * n2, n); matrix.tickWriteDevice(); - permutation.tickWriteDevice(); - permutation.printIndexedBuffer("PERMUTE"); - lower.setIdentity(); // set up U to identity matrix - upper.setIdentity(); - fillLowerUpperKernel<<<1, n2, 128>>>(lower.specialBuffer(), lower.specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), matrix.specialBuffer(), matrix.specialShapeInfo(), n); - lower.tickWriteDevice(); - upper.tickWriteDevice(); - invertUpperMatrix(&upper, &matrix); - invertLowerMatrix(&lower, &upper); - lower.tickWriteDevice(); - upper.tickWriteDevice(); - lower.printIndexedBuffer("LOWER"); - upper.printIndexedBuffer("UPPER"); + compound.assign(matrix); + lup_(context, &compound, nullptr, nullptr); + fillLowerUpperKernel<<>>(lower.specialBuffer(), lower.specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), compound.specialBuffer(), compound.specialShapeInfo(), n); + matrix.assign(0); + invertUpperMatrix(&upper, &matrix); // U^{-1} + compound.assign(0); + invertLowerMatrix(&lower, &compound); // L^{-1} - nd4j::MmulHelper::mmul(&matrix, &upper, &compound, 1.0, 0.0); - nd4j::MmulHelper::mmul(&compound, &permutation, &matrix, 1.0, 0.0); -// for (int k = e * n2, row = 0; k < (e + 1) * n2; k++) { -// output->t(k) = matrix.template t(row++); -// } + nd4j::MmulHelper::mmul(&matrix, &compound, &upper, 1.0, 0.0); + returnMatrix<<<1, n2, 128, *stream>>>(output->specialBuffer(), output->specialShapeInfo(), upper.specialBuffer(), upper.specialShapeInfo(), i * n2, n); } - - return Status::OK(); } int inverse(nd4j::LaunchContext * context, NDArray* input, NDArray* output) { - BUILD_SINGLE_SELECTOR(input->dataType(), return inverse_, (context, input, output), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR(input->dataType(), return inverse_, (context, input, output), FLOAT_NATIVE); } bool checkCholeskyInput(nd4j::LaunchContext * context, NDArray const* input) { @@ -803,7 +830,7 @@ namespace helpers { return cholesky_(context, input, output, inplace); } // BUILD_SINGLE_TEMPLATE(template int cholesky_, (LaunchContext* context, NDArray* input, NDArray* output, bool inplace), FLOAT_TYPES); - BUILD_SINGLE_TEMPLATE(template int inverse_, (nd4j::LaunchContext* context, NDArray* input, NDArray* output),
FLOAT_NATIVE); __global__ void logDetKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong batchNum, Nd4jLong* tadShape, Nd4jLong* tadOffsets, void* outputBuf, Nd4jLong* outputShape) { __shared__ double* output; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu index deedbc706..fab9577d6 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu @@ -143,7 +143,7 @@ namespace helpers { /////////////////////////////////////////////////////////////////// template - static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ + static void reverseSequence_(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim){ int posOfNonUnityDim = -1; seqLengths->syncToHost(); auto stream = context->getCudaStream(); @@ -193,7 +193,7 @@ namespace helpers { } void reverseSequence(nd4j::LaunchContext * context, const NDArray* input, const NDArray* seqLengths, NDArray* output, int seqDim, const int batchDim) { - BUILD_SINGLE_SELECTOR(input->dataType(), _reverseSequence, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); + BUILD_SINGLE_SELECTOR(input->dataType(), reverseSequence_, (context, input, seqLengths, output, seqDim, batchDim), LIBND4J_TYPES); } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu index 89cedc825..b6f0c215a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu @@ -391,7 +391,7 @@ static void scatterCudaLauncher(const int blocksPerGrid, const int threadsPerBlo /////////////////////////////////////////////////////////////////// void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& indices, const NDArray& updates, NDArray& output, const bool lock) { - PointersManager manager(context, "scatterND"); + PointersManager manager(context, "scatter"); NDArray::prepareSpecialUse({&output}, {&updates, &indices}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment.cu index 094420857..67cb77b5c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment.cu @@ -19,6 +19,7 @@ // #include +#include #include #include #include @@ -30,933 +31,20 @@ namespace nd4j { namespace ops { namespace helpers { - // -------------------------------------------------------------------------------------------------------------- // - // Segment ops linear kernels - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void segmentMaxLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x / threadsPerSegment; - x = reinterpret_cast(input); - z = 
reinterpret_cast(output); - extern __shared__ unsigned char shmem[]; - val = reinterpret_cast(shmem); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - - if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - start = starts[segment]; - finish = start + lengths[segment]; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; - val[segment] = z[zIndex]; - } - - } - __syncthreads(); - - for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); - } - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void unsortedSegmentMaxLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ I* y; //int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - segment = blockIdx.x; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - y = reinterpret_cast(indices); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - //start = starts[segment]; - //finish = start + lengths[segment]; - if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; - else - z[zIndex] = -DataTypeUtils::max(); - } - __syncthreads(); - if (lengths[segment] > 0) - for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); - if (y[yIndex] == segment) { - nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void segmentMinLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - extern __shared__ unsigned char shmem[]; - val = reinterpret_cast(shmem); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - - if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - start = starts[segment]; - finish = start + lengths[segment]; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; - val[segment] = z[zIndex]; - } - - } - __syncthreads(); - - for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); - } - - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void unsortedSegmentMinLinearKernel(void* input, Nd4jLong* inputShape, void* 
indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ I* y; //int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - segment = blockIdx.x; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - y = reinterpret_cast(indices); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; - else - z[zIndex] = DataTypeUtils::max(); - - } - __syncthreads(); - if (lengths[segment] > 0) - for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); - if (y[yIndex] == segment) { - nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void segmentSumLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - - - if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - start = starts[segment]; - finish = start + lengths[segment]; - //val[segment] = ; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; - } - - } - __syncthreads(); - - for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); - } - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void unsortedSegmentSumLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ I* y; //int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - segment = blockIdx.x; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - y = reinterpret_cast(indices); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; - else - z[zIndex] = 0; //DataTypeUtils::max(); - } - __syncthreads(); - - if (lengths[segment] > 0) - for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); - if (y[yIndex] == segment && e != starts[segment]) { - 
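// The linear segment kernels above (max/min/sum and their unsorted variants)
// all share one launch pattern: one block per segment, thread 0 seeds the
// output slot with the segment's first element, and the remaining elements
// are folded in with an atomic op. A standalone sketch of that pattern for a
// float sum, assuming a dense layout and non-empty segments (which the sorted
// ops guarantee); the real kernels walk shape info via shape::getIndexOffset
// instead of raw indices.
#include <cuda_runtime.h>

__global__ void segmentSumSketch(const float* x, const int* starts,
                                 const int* lengths, float* z) {
    const int segment = blockIdx.x;          // one block <-> one segment
    const int start   = starts[segment];
    const int finish  = start + lengths[segment];
    if (threadIdx.x == 0)
        z[segment] = x[start];               // seed with the first element
    __syncthreads();                         // seed must land before the atomics
    // remaining elements are folded in atomically, strided across the block
    for (int e = start + 1 + threadIdx.x; e < finish; e += blockDim.x)
        atomicAdd(&z[segment], x[e]);
}
// launched, e.g., as segmentSumSketch<<<numOfClasses, 256>>>(x, starts, lengths, z);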
nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void segmentMeanLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); -// extern __shared__ unsigned char shmem[]; -// val = reinterpret_cast(shmem); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - - //[zIndex] = - if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - start = starts[segment]; - finish = start + lengths[segment]; - //val[segment] = ; - z[zIndex] = T(x[shape::getIndexOffset(start, inputShape, xLen)] / lengths[segment]); -// val[segment] = z[zIndex]; - } - - } - __syncthreads(); - - for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - if (lengths[segment]) - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex] / lengths[segment])); - } - } - // -------------------------------------------------------------------------------------------------------------- // - template - static __global__ void unsortedSegmentMeanLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ I* y; //int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { -// threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x;// / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - y = reinterpret_cast(indices); -// extern __shared__ unsigned char shmem[]; -// val = reinterpret_cast(shmem); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - -// if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - //start = starts[segment]; - //finish = start + lengths[segment]; - if (lengths[segment] > 0) - z[zIndex] = T(x[shape::getIndexOffset(starts[segment], inputShape, xLen)] / T(lengths[segment])); - else - z[zIndex] = 0; //DataTypeUtils::max(); -// val[segment] = z[zIndex]; -// } - - } - __syncthreads(); - if (lengths[segment] > 0) - for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); - if (y[yIndex] == segment && e != starts[segment]) { - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/T(lengths[segment]))); - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void segmentProdLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ 
T* x; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - extern __shared__ unsigned char shmem[]; - val = reinterpret_cast(shmem); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - - if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - start = starts[segment]; - finish = start + lengths[segment]; - //val[segment] = ; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; - val[segment] = z[zIndex]; - } - - } - __syncthreads(); -// auto tid = threadIdx.x + blockIdx.x * blockDim.x; -// auto step = blockDim.x * gridDim.x; - - for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - nd4j::math::atomics::nd4j_atomicMul(&val[segment], x[xIndex]); - } - __syncthreads(); - - if (threadIdx.x == 0) { - z[zIndex] = val[segment]; - } - - } - template - static __global__ void unsortedSegmentProdLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ I* y; //int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { -// threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x;// / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - y = reinterpret_cast(indices); -// extern __shared__ unsigned char shmem[]; -// val = reinterpret_cast(shmem); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - -// if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - //start = starts[segment]; - //finish = start + lengths[segment]; - if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; - else - z[zIndex] = 0; //DataTypeUtils::max(); -// val[segment] = z[zIndex]; -// } - - } - __syncthreads(); - if (lengths[segment] > 0) - for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); - if (y[yIndex] == segment && e != starts[segment]) { - nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - template - static __global__ void unsortedSegmentSqrtNLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { - __shared__ T* val; - __shared__ Nd4jLong xLen, zLen, segment, zIndex; - __shared__ T* x; - __shared__ T* z; - __shared__ I* y; //int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { -// threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; - segment = blockIdx.x;// / threadsPerSegment; - x = reinterpret_cast(input); - z = reinterpret_cast(output); - y = reinterpret_cast(indices); -// extern __shared__ unsigned char shmem[]; -// val = reinterpret_cast(shmem); - xLen = shape::length(inputShape); - zLen = shape::length(outputShape); - -// if 
(segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); - //start = starts[segment]; - //finish = start + lengths[segment]; - if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)] / nd4j::math::nd4j_sqrt(lengths[segment]); - else - z[zIndex] = 0; //DataTypeUtils::max(); -// val[segment] = z[zIndex]; -// } - - } - __syncthreads(); - if (lengths[segment] > 0) - for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); - if (y[yIndex] == segment && e != starts[segment]) { - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - // fill up segments starts and ends - splitted ordered case - template - static __global__ void fillUpSegmentsKernel(void* indices, Nd4jLong* indexShape, int numClasses, int* classesRangesStart, int* classesRangesLenghts) { - __shared__ I* idxBuf; - __shared__ Nd4jLong idxLen; - __shared__ int* result; - if (threadIdx.x == 0) { - idxBuf = reinterpret_cast(indices); - idxLen = shape::length(indexShape); - } - __syncthreads(); - - auto tid = threadIdx.x + blockDim.x * blockIdx.x; - auto step = blockDim.x * gridDim.x; - - for (auto j = tid; j < idxLen; j += step) { - auto pos = idxBuf[j]; - nd4j::math::atomics::nd4j_atomicMin(&classesRangesStart[pos], (int)j); - nd4j::math::atomics::nd4j_atomicAdd(&classesRangesLenghts[pos], 1); - } - } - // -------------------------------------------------------------------------------------------------------------- // - - // -------------------------------------------------------------------------------------------------------------- // - // fill up segments starts and counts - cumulative case - template - static __global__ void fillUpUnsortedSegmentsKernel(void* indices, Nd4jLong* indexShape, int numClasses, int* classes) { - __shared__ I* idxBuf; - __shared__ Nd4jLong idxLen; - __shared__ int* result; - if (threadIdx.x == 0) { - idxBuf = reinterpret_cast(indices); - idxLen = shape::length(indexShape); - } - __syncthreads(); - - auto tid = threadIdx.x + blockDim.x * blockIdx.x; - auto step = blockDim.x * gridDim.x; - - for (auto j = tid; j < idxLen; j += step) { - auto k = idxBuf[j]; - auto beginPos = 2 * k; - auto sizePos = beginPos + 1; - printf("%d, %d\n", beginPos, sizePos); - nd4j::math::atomics::nd4j_atomicMin(&classes[beginPos], (int)j); - nd4j::math::atomics::nd4j_atomicAdd(&classes[sizePos], 1); - } - } - // -------------------------------------------------------------------------------------------------------------- // - - // -------------------------------------------------------------------------------------------------------------- // - // segment ops multidimentional cases - // -------------------------------------------------------------------------------------------------------------- // - - template - static __global__ void segmentMaxTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, - Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, - Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets, T filler = 0) { - - __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; - __shared__ T* z; - __shared__ int start, finish; - - if (threadIdx.x == 
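// fillUpSegmentsKernel reduces the sorted index vector to per-class ranges:
// atomicMin records the first position at which each class occurs, and
// atomicAdd counts its occurrences. A freestanding sketch of the same idea,
// assuming the caller pre-fills starts with a large sentinel (the index
// count) and lengths with zero, exactly as the functors below arrange via
// classesRangesBegs.assign(indices->lengthOf()) and classesRangesLens.assign(0):
#include <cuda_runtime.h>

__global__ void fillUpSegmentsSketch(const int* indices, long idxLen,
                                     int* starts, int* lengths) {
    const int tid  = threadIdx.x + blockDim.x * blockIdx.x;
    const int step = blockDim.x * gridDim.x;
    for (long j = tid; j < idxLen; j += step) {
        const int cls = indices[j];
        atomicMin(&starts[cls], (int) j);   // earliest position of this class
        atomicAdd(&lengths[cls], 1);        // one more element in this class
    }
}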
0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; - z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; - len = shape::length(inputTads); - - start = starts[segment]; - finish = start + lengths[segment]; - total = shape::sizeAt(inputShape, 0); - } - __syncthreads(); - - auto idx = blockIdx.x; - if (blockIdx.x <= total) { - auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; - if (blockIdx.x == start) { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - z[zIndex] = x[xIndex]; - } - } - else { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); - } - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - - // SegmentMin kernel - template - static __global__ void segmentMinTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { - __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; - z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; - len = shape::length(inputTads); - start = starts[segment]; - finish = start + lengths[segment]; - total = shape::sizeAt(inputShape, 0); - - } - __syncthreads(); - - auto idx = blockIdx.x; - if (blockIdx.x <= total) { - auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; - if (blockIdx.x == start) { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - z[zIndex] = x[xIndex]; - } - } - else { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); - } - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - - // SegmentSum kernel - template - static __global__ void segmentSumTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { - __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; - z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; - len = shape::length(inputTads); - start = starts[segment]; - finish = start + lengths[segment]; - total = shape::sizeAt(inputShape, 0); - - } - __syncthreads(); - - auto idx = blockIdx.x; - if (blockIdx.x <= total) { - auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; - if (blockIdx.x == start) { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, 
inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - z[zIndex] = x[xIndex]; - } - } - else { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - if (lengths[segment]) - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); - } - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - - // SegmentMean kernel - template - static __global__ void segmentMeanTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { - __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; - z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; - len = shape::length(inputTads); - start = starts[segment]; - finish = start + lengths[segment]; - total = shape::sizeAt(inputShape, 0); - - } - __syncthreads(); - - auto idx = blockIdx.x; - if (blockIdx.x <= total) { - auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; - if (blockIdx.x == start) { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - z[zIndex] = T(x[xIndex]/lengths[segment]); - } - } - else { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - if (lengths[segment]) - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); - } - } - } - } - // -------------------------------------------------------------------------------------------------------------- // - - // SegmentProd kernel - template - static __global__ void segmentProdTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { - __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; - z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; - len = shape::length(inputTads); - start = starts[segment]; - finish = start + lengths[segment]; - total = shape::sizeAt(inputShape, 0); - - } - __syncthreads(); - - auto idx = blockIdx.x; - if (blockIdx.x <= total) { - auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; - if (blockIdx.x == start) { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - z[zIndex] = x[xIndex]; - } - } - else { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); - } - } - } - } - // SegmentSqrtN kernel - template - static __global__ void 
segmentSqrtNTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { - __shared__ T* val; - __shared__ Nd4jLong len, segment, zIndex, total; - __shared__ T* z; - __shared__ int threadsPerSegment, start, finish; - - if (threadIdx.x == 0) { - segment = indices[blockIdx.x]; // / threadsPerSegment; - z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; - len = shape::length(inputTads); - start = starts[segment]; - finish = start + lengths[segment]; - total = shape::sizeAt(inputShape, 0); - - } - __syncthreads(); - - auto idx = blockIdx.x; - if (blockIdx.x <= total) { - auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; - if (blockIdx.x == start) { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - z[zIndex] = x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment]); - } - } - else { - for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); - nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); - } - } - } - } - // -------------------------------------------------------------------------------------------------------------- // // Sorted segments ops implementations - // -------------------------------------------------------------------------------------------------------------- // - - template - static void segmentMaxFunctor_(LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { - //int numClasses = output->sizeAt(0); - // if input is a vector: (as if in doc sample) - //Nd4jLong idx = indices->e(0); - auto stream = context->getCudaStream(); - Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; - NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); - - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(256, 512, 256); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); - - NDArray::prepareSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens}); - - if (input->isVector()) { - segmentMaxLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - segmentMaxTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, 
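// In the non-vector branch above, one block is launched per outer TAD (for a
// 2D input: one row), and each row is merged into the output row selected by
// its segment index. The shape-info-driven kernels seed a segment with its
// first row and merge the rest atomically; the sketch below simplifies this
// to dense row-major float data and a pre-zeroed output, which makes a plain
// atomicAdd merge race-free. Names here are illustrative only.
#include <cuda_runtime.h>
#include <cstddef>

__global__ void segmentSumRowsSketch(const float* in, int numRows, int rowLen,
                                     const int* indices,
                                     float* out /* pre-zeroed */) {
    const int row = blockIdx.x;              // one block <-> one input TAD/row
    if (row >= numRows) return;
    const float* x = in  + (std::size_t) row * rowLen;
    float*       z = out + (std::size_t) indices[row] * rowLen; // target segment row
    for (int e = threadIdx.x; e < rowLen; e += blockDim.x)
        atomicAdd(&z[e], x[e]);              // merge this row into its segment
}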
reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens}); - } - - // segmen min - template - static void segmentMinFunctor_(LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { - auto stream = context->getCudaStream(); - Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; - NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); - - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - - dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); - NDArray::prepareSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens}); - - if (input->isVector()) { - segmentMinLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - segmentMinTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - - } - NDArray::registerSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens}); - - } - - // segmen mean - template - static void segmentMeanFunctor_(LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { - auto stream = context->getCudaStream(); - Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; - NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); - - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - - dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); - - if (input->isVector()) { - segmentMeanLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto 
packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - segmentMeanTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - - } - - template - static void segmentSumFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { - auto stream = context->getCudaStream(); - Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; - NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); - - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - - dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); - - if (input->isVector()) { - segmentSumLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - segmentSumTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - - } - - template - static void segmentProdFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { - auto stream = context->getCudaStream(); - Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; - NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); - - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - - dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); - - if (input->isVector()) { - segmentProdLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = 
ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - segmentProdTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - - } template static bool segmentIndicesValidate_(NDArray* indices, NDArray& aexpected, NDArray& aoutput) { return true; } - void segmentMaxFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentMaxFunctor_, (context, input, indices, output), NUMERIC_TYPES, INTEGER_TYPES); - } - - void segmentMinFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentMinFunctor_, (context, input, indices, output), NUMERIC_TYPES, INTEGER_TYPES); - } - - void segmentMeanFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), segmentMeanFunctor_, (context, input, indices, output), FLOAT_TYPES, INTEGER_TYPES); - } - - void segmentSumFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentSumFunctor_, (context, input, indices, output), NUMERIC_TYPES, INTEGER_TYPES); - } - - void segmentProdFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), segmentProdFunctor_, (context, input, indices, output), NUMERIC_TYPES, INTEGER_TYPES); - } - bool segmentIndicesValidate(nd4j::LaunchContext* context , NDArray* indices, NDArray& expected, NDArray& output) { BUILD_DOUBLE_SELECTOR(output.dataType(), indices->dataType(), return segmentIndicesValidate_, (indices, expected, output), NUMERIC_TYPES, INTEGER_TYPES); } BUILD_DOUBLE_TEMPLATE(template bool segmentIndicesValidate_, (NDArray*, NDArray&, NDArray&), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void segmentProdFunctor_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void segmentSumFunctor_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void segmentMeanFunctor_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void segmentMinFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void segmentMaxFunctor_, (LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - // 
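// BUILD_DOUBLE_SELECTOR / BUILD_DOUBLE_TEMPLATE expand into a runtime switch
// over the (values dtype, indices dtype) pair that forwards to the matching
// template instantiation, which is why each functor above is instantiated for
// a type-list cross product. A hand-rolled miniature of that dispatch, with a
// deliberately tiny hypothetical type list instead of nd4j's DataType enum:
#include <stdexcept>
#include <cstdint>

enum class DType { FLOAT32, DOUBLE };
enum class IType { INT32, INT64 };

template <typename T, typename I>
static void segmentMaxFunctorImpl(const void* in, const void* idx, void* out) {
    // ... the typed implementation (kernel launches etc.) would go here ...
}

static void segmentMaxDispatch(DType t, IType i,
                               const void* in, const void* idx, void* out) {
    if (t == DType::FLOAT32 && i == IType::INT32) return segmentMaxFunctorImpl<float,  int32_t>(in, idx, out);
    if (t == DType::FLOAT32 && i == IType::INT64) return segmentMaxFunctorImpl<float,  int64_t>(in, idx, out);
    if (t == DType::DOUBLE  && i == IType::INT32) return segmentMaxFunctorImpl<double, int32_t>(in, idx, out);
    if (t == DType::DOUBLE  && i == IType::INT64) return segmentMaxFunctorImpl<double, int64_t>(in, idx, out);
    throw std::runtime_error("unsupported (values, indices) type pair");
}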
-------------------------------------------------------------------------------------------------------------- // // -------------------------------------------------------------------------------------------------------------- // // Unsorted segment ops functors implementation @@ -1002,1158 +90,46 @@ namespace helpers { BUILD_SINGLE_TEMPLATE(template bool unsortedSegmentIndicesValidate_, (nd4j::LaunchContext* context , NDArray* indices, Nd4jLong expected, Nd4jLong& output), INTEGER_TYPES); // -------------------------------------------------------------------------------------------------------------- // - template - static void unsortedSegmentMaxFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); -// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); - NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); -// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); -// int* classesBuf = reinterpret_cast(classes.specialBuffer()); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numOfClasses, begins, lengths); - classesRangesBegs.syncToHost(); - classesRangesLens.syncToHost(); - - if (input->isVector()) { - unsortedSegmentMaxLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - dims.x = input->sizeAt(0); - output->assign(-DataTypeUtils::max()); - segmentMaxTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - - } // -------------------------------------------------------------------------------------------------------------- // - - template - static void unsortedSegmentMinFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); -// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); - NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); -// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// 
classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); -// int* classesBuf = reinterpret_cast(classes.specialBuffer()); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numOfClasses, begins, lengths); - - if (input->isVector()) { - unsortedSegmentMinLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - output->assign(DataTypeUtils::max()); - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - dims.x = input->sizeAt(0); - segmentMinTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static void unsortedSegmentMeanFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); -// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); - NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); -// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); -// int* classesBuf = reinterpret_cast(classes.specialBuffer()); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numOfClasses, begins, lengths); - - if (input->isVector()) { - unsortedSegmentMeanLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - output->assign(0); - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - 
Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - dims.x = input->sizeAt(0); - segmentMeanTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static void unsortedSegmentSumFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); -// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); - NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); -// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numOfClasses, indices->lengthOf(), (numOfClasses + 1) * 64); -// int* classesBuf = reinterpret_cast(classes.specialBuffer()); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numOfClasses, begins, lengths); - - if (input->isVector()) { - unsortedSegmentSumLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - output->assign(0); - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - dims.x = input->sizeAt(0); - segmentSumTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static void unsortedSegmentProdFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); -// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); - NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); -// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 
32 + 32); -// int* classesBuf = reinterpret_cast(classes.specialBuffer()); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numOfClasses, begins, lengths); - - if (input->isVector()) { - unsortedSegmentProdLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - output->assign(1); - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - dims.x = input->sizeAt(0); - segmentProdTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static void unsortedSegmentSqrtNFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); -// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); - NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); -// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); -// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); -// int* classesBuf = reinterpret_cast(classes.specialBuffer()); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numOfClasses, begins, lengths); - - if (input->isVector()) { - unsortedSegmentSqrtNLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); - } - else { - output->assign(0); - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - dims.x = input->sizeAt(0); - segmentSqrtNTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, 
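// Unlike the sorted case, the unsorted segment functors cannot assume every
// class occurs in the indices, so each one first fills the whole output with
// the op's identity element (max -> -DataTypeUtils::max(), min -> +max,
// sum/mean/sqrtN -> 0, prod -> 1, as the assign calls above show) and lets
// the kernels overwrite only the classes that are present. A sketch of that
// host-side set-up; SegOp, identityOf and fillSketch are illustrative names:
#include <cuda_runtime.h>
#include <cfloat>

enum class SegOp { MAX, MIN, SUM, PROD };

static float identityOf(SegOp op) {
    switch (op) {
        case SegOp::MAX:  return -FLT_MAX;  // any real value beats the seed
        case SegOp::MIN:  return  FLT_MAX;
        case SegOp::SUM:  return 0.f;       // also mean and sqrtN
        case SegOp::PROD: return 1.f;
    }
    return 0.f;
}

__global__ void fillSketch(float* z, long len, float v) {
    for (long e = threadIdx.x + (long) blockIdx.x * blockDim.x; e < len;
         e += (long) blockDim.x * gridDim.x)
        z[e] = v;
}
// usage: fillSketch<<<128, 256>>>(outBuf, outLen, identityOf(SegOp::PROD));
// ...then the per-op linear/TAD kernel merges the occupied classes over that background.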
reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); - } - } - // -------------------------------------------------------------------------------------------------------------- // - // -------------------------------------------------------------------------------------------------------------- // - // unsorted ops functors - // -------------------------------------------------------------------------------------------------------------- // - - void unsortedSegmentMaxFunctor(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMaxFunctor_, (context, input, indices, numOfClasses, output), NUMERIC_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - void unsortedSegmentMinFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMinFunctor_, (context, input, indices, numOfClasses, output), - NUMERIC_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - void unsortedSegmentMeanFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMeanFunctor_, (context, input, indices, numOfClasses, output), - FLOAT_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - void unsortedSegmentSumFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentSumFunctor_, (context, input, indices, numOfClasses, output), - NUMERIC_TYPES, INTEGER_TYPES); - - } - // -------------------------------------------------------------------------------------------------------------- // - - void unsortedSegmentProdFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentProdFunctor_, (context, input, indices, numOfClasses, output), - FLOAT_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - void unsortedSegmentSqrtNFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentSqrtNFunctor_, (context, input, indices, numOfClasses, output), - FLOAT_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentMaxFunctor_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentMinFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* 
output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentMeanFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentSumFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentProdFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentSqrtNFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - // -------------------------------------------------------------------------------------------------------------- // - - // -------------------------------------------------------------------------------------------------------------- // - // Backpropagate ops helpers - // -------------------------------------------------------------------------------------------------------------- // - // Sorted backpropagate ops - // -------------------------------------------------------------------------------------------------------------- // - // segment max - // -------------------------------------------------------------------------------------------------------------- // - template <typename T, typename I> - static __global__ void segmentMaxBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput, - Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, - void* outputBuf, Nd4jLong* outputShape) { - __shared__ T* x; - __shared__ T* gradIn; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, gradLen; - + // fill up segment starts and ends - split ordered case + template <typename I> + static __global__ void fillUpSegmentsKernel(void* indices, Nd4jLong* indexShape, int numClasses, int* classesRangesStart, int* classesRangesLenghts) { + __shared__ I* idxBuf; + __shared__ Nd4jLong idxLen; + __shared__ int* result; if (threadIdx.x == 
0) { - xLen = shape::length(inputShape); - x = reinterpret_cast(inputBuf); - y = reinterpret_cast(indicesBuf); - z = reinterpret_cast(outputBuf); - gradOut = reinterpret_cast(eps); - gradLen = shape::length(epsShape); - } - - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = gridDim.x * blockDim.x; - - for (auto e = start; e < xLen; e += step) { - - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); - auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); - - z[zOffset] = gradOut[gradOffsetO]; - } - } - - template - static __global__ void segmentProdBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput, - Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, - void* outputBuf, Nd4jLong* outputShape) { - __shared__ T* x; - __shared__ T* gradIn; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, gradLen; - - if (threadIdx.x == 0) { - xLen = shape::length(inputShape); - x = reinterpret_cast(inputBuf); - y = reinterpret_cast(indicesBuf); - z = reinterpret_cast(outputBuf); - gradIn = reinterpret_cast(forwardOutput); - gradOut = reinterpret_cast(eps); - gradLen = shape::length(epsShape); - } - - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = gridDim.x * blockDim.x; - - for (auto e = start; e < xLen; e += step) { - - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); - auto classIndex = y[yOffset]; - auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); - - z[zOffset] = gradOut[gradOffsetO] * gradIn[gradOffsetI] / x[xOffset]; - } - } - - template - static __global__ void segmentMeanBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, - int* lengths, void* outputBuf, Nd4jLong* outputShape) { - __shared__ T* x; - __shared__ T* gradIn; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, gradLen; - - if (threadIdx.x == 0) { - xLen = shape::length(inputShape); - x = reinterpret_cast(inputBuf); - y = reinterpret_cast(indicesBuf); - z = reinterpret_cast(outputBuf); - gradOut = reinterpret_cast(eps); - gradLen = shape::length(epsShape); - } - - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = gridDim.x * blockDim.x; - - for (auto e = start; e < xLen; e += step) { - - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); - auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); - - z[zOffset] = T(gradOut[gradOffsetO] / float(lengths[classIndex])); - } - } - - template - static __global__ void segmentSqrtNBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, - int* lengths, void* outputBuf, Nd4jLong* outputShape) { - __shared__ T* x; - __shared__ T* gradIn; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, gradLen; - - if (threadIdx.x == 0) { - xLen = 
shape::length(inputShape); - x = reinterpret_cast(inputBuf); - y = reinterpret_cast(indicesBuf); - z = reinterpret_cast(outputBuf); - gradOut = reinterpret_cast(eps); - gradLen = shape::length(epsShape); - } - - auto start = blockIdx.x * blockDim.x + threadIdx.x; - auto step = gridDim.x * blockDim.x; - - for (auto e = start; e < xLen; e += step) { - - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); - auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); - - z[zOffset] = T(gradOut[gradOffsetO] / math::nd4j_sqrt(lengths[classIndex])); - } - } - - template - static __global__ void segmentMaxBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput, - Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, - void* outputBuf, Nd4jLong* outputShape,Nd4jLong* inputTad, - Nd4jLong* inputOffsets, Nd4jLong* gradInTad, Nd4jLong* gradInOffsets, - Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, - Nd4jLong* outOffsets) { - __shared__ T* x; - __shared__ T* gradIn; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; - - if (threadIdx.x == 0) { - xLen = shape::length(inputShape); - x = reinterpret_cast(inputBuf); - y = reinterpret_cast(indicesBuf); - z = reinterpret_cast(outputBuf); - yLen = shape::length(indicesShape); - gradOut = reinterpret_cast(eps); - gradIn = reinterpret_cast(forwardOutput); - gradLen = shape::length(epsShape); - currentLen = shape::length(outTad); - } - - for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); - auto segment = y[yIndex]; - T* current = x + inputOffsets[i]; - T* currentOut = z + outOffsets[i]; - T* in = gradIn + gradInOffsets[segment]; - T* outGrad = gradOut + gradOutOffsets[segment]; - - for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - if (nd4j::math::nd4j_abs(in[e] - current[e]) <= T(1.e-6)) - currentOut[e] = outGrad[e]; - } - } - } - - template - static __global__ void segmentSumBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, - void* indicesBuf, Nd4jLong* indicesShape, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* inputTad, - Nd4jLong* inputOffsets, Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, Nd4jLong* outOffsets) { - __shared__ T* x; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; - - if (threadIdx.x == 0) { - xLen = shape::length(inputShape); - x = reinterpret_cast(inputBuf); - y = reinterpret_cast(indicesBuf); - z = reinterpret_cast(outputBuf); - yLen = shape::length(indicesShape); - gradOut = reinterpret_cast(eps); - gradLen = shape::length(epsShape); - currentLen = shape::length(outTad); - } - - for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); - auto segment = y[yIndex]; - T* currentOut = z + outOffsets[i]; - T* outGrad = gradOut + gradOutOffsets[segment]; - - for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - currentOut[e] = outGrad[e]; - } - } - - } - template - static __global__ void segmentMeanBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, - void* indicesBuf, Nd4jLong* indicesShape, int* lengths, void* outputBuf, Nd4jLong* 
outputShape,Nd4jLong* inputTad, - Nd4jLong* inputOffsets, Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, Nd4jLong* outOffsets) { - __shared__ T* x; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; - - if (threadIdx.x == 0) { - xLen = shape::length(inputShape); - x = reinterpret_cast(inputBuf); - y = reinterpret_cast(indicesBuf); - z = reinterpret_cast(outputBuf); - yLen = shape::length(indicesShape); - gradOut = reinterpret_cast(eps); - gradLen = shape::length(epsShape); - currentLen = shape::length(outTad); + idxBuf = reinterpret_cast(indices); + idxLen = shape::length(indexShape); } __syncthreads(); - for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { -// auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); - auto segment = y[i]; //yIndex]; - T* currentOut = z + outOffsets[i]; - T* outGrad = gradOut + gradOutOffsets[segment]; + auto tid = threadIdx.x + blockDim.x * blockIdx.x; + auto step = blockDim.x * gridDim.x; - for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - auto zIndex = shape::getIndexOffset(e, outTad, currentLen); - auto gradIndex = shape::getIndexOffset(e, gradOutTad, gradLen); - if (lengths[segment] > 0) - currentOut[zIndex] = T(outGrad[gradIndex] / float(lengths[segment])); - } - } - } - template - static __global__ void segmentSqrtNBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, - void* indicesBuf, Nd4jLong* indicesShape, int* lengths, void* outputBuf, Nd4jLong* outputShape,Nd4jLong* inputTad, - Nd4jLong* inputOffsets, Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, Nd4jLong* outOffsets) { - __shared__ T* x; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; - - if (threadIdx.x == 0) { - xLen = shape::length(inputShape); - x = reinterpret_cast(inputBuf); - y = reinterpret_cast(indicesBuf); - z = reinterpret_cast(outputBuf); - yLen = shape::length(indicesShape); - gradOut = reinterpret_cast(eps); - gradLen = shape::length(epsShape); - currentLen = shape::length(outTad); - } - __syncthreads(); - - for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { -// auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); - auto segment = y[i]; //yIndex]; - T* currentOut = z + outOffsets[i]; - T* outGrad = gradOut + gradOutOffsets[segment]; - - for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - auto zIndex = shape::getIndexOffset(e, outTad, currentLen); - auto gradIndex = shape::getIndexOffset(e, gradOutTad, gradLen); - if (lengths[segment] > 0) - currentOut[zIndex] = T(outGrad[gradIndex] / math::nd4j_sqrt(lengths[segment])); - } + for (auto j = tid; j < idxLen; j += step) { + auto pos = idxBuf[j]; + nd4j::math::atomics::nd4j_atomicMin(&classesRangesStart[pos], (int)j); + nd4j::math::atomics::nd4j_atomicAdd(&classesRangesLenghts[pos], 1); } } - template - static __global__ void segmentProdBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput, - Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, - void* outputBuf, Nd4jLong* outputShape,Nd4jLong* inputTad, - Nd4jLong* inputOffsets, Nd4jLong* gradInTad, Nd4jLong* gradInOffsets, - Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, - Nd4jLong* outOffsets) { - __shared__ T* x; - __shared__ T* gradIn; - __shared__ T* gradOut; - __shared__ I* y; - __shared__ T* z; - __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; + // 
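-------------------------------------------------------------------------------------------------------------- //

The added fillUpSegmentsKernel is shared bookkeeping: each CUDA thread strides over the flattened indices array, records the first row at which every class id occurs (atomic min into classesRangesStart) and how many rows carry that id (atomic add into classesRangesLenghts). A minimal sequential C++ sketch of the same computation, useful as a CPU cross-check; the function name and vector-based signature are illustrative only and not part of the patch:

    // Sequential reference for what fillUpSegmentsKernel computes in parallel:
    // starts[c] ends up as the first position labelled with class c (or the
    // sentinel indices.size() if c never occurs), lengths[c] as its row count.
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    void fillUpSegmentsReference(const std::vector<int64_t>& indices, int numClasses,
                                 std::vector<int>& starts, std::vector<int>& lengths) {
        starts.assign(numClasses, static_cast<int>(indices.size())); // same sentinel the host code assigns
        lengths.assign(numClasses, 0);
        for (int j = 0; j < static_cast<int>(indices.size()); ++j) {
            const auto pos = indices[j];
            starts[pos] = std::min(starts[pos], j); // nd4j_atomicMin in the kernel
            lengths[pos] += 1;                      // nd4j_atomicAdd in the kernel
        }
    }

Because the min is idempotent and the adds commute, thread order does not matter, which is what lets this single generic kernel replace the per-op index scans deleted throughout this hunk.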
- if (threadIdx.x == 0) { - xLen = shape::length(inputShape); - x = reinterpret_cast<T*>(inputBuf); - y = reinterpret_cast<I*>(indicesBuf); - z = reinterpret_cast<T*>(outputBuf); - yLen = shape::length(indicesShape); - gradOut = reinterpret_cast<T*>(eps); - gradIn = reinterpret_cast<T*>(forwardOutput); - gradLen = shape::length(epsShape); - currentLen = shape::length(outTad); + template <typename I> + static void fillUpSegments_(NDArray* indices, Nd4jLong numClasses, NDArray& classesRangesBegs, NDArray& classesRangesLens) { + dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); + int* begins = reinterpret_cast<int*>(classesRangesBegs.getSpecialBuffer()); + int* lengths = reinterpret_cast<int*>(classesRangesLens.getSpecialBuffer()); + auto stream = classesRangesBegs.getContext()->getCudaStream(); + fillUpSegmentsKernel<I><<<dims.x, dims.y, dims.z, *stream>>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); } + // -------------------------------------------------------------------------------------------------------------- // - for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); - auto segment = y[yIndex]; - T* current = x + inputOffsets[i]; - T* currentOut = z + outOffsets[i]; - T* in = gradIn + gradInOffsets[segment]; - T* outGrad = gradOut + gradOutOffsets[segment]; - - for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - currentOut[e] = outGrad[e] * in[e] / current[e]; - } + void fillUpSegments(NDArray* indices, Nd4jLong numClasses, NDArray& classesRangesBegs, NDArray& classesRangesLens) { + BUILD_SINGLE_SELECTOR(indices->dataType(), fillUpSegments_, (indices, numClasses, classesRangesBegs, classesRangesLens), INTEGER_TYPES); } - - } - - template <typename T, typename I> - int segmentMaxFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - //int numOfClasses = gradOut->sizeAt(0); - // if input is a vector: (as if in doc sample) - auto stream = context->getCudaStream(); - NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT<T>(), context);//->shapeInfo(), context); - segmentMaxFunctor_<T, I>(context, input, indices, &tempRes); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut, &tempRes}); - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentMaxBPLinearKernel<T, I><<<1 + gradOut->lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector<int> dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = 
packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); - Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentMaxBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), - inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut, &tempRes}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - template - int segmentMinFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - //int numOfClasses = gradOut->sizeAt(0); - // if input is a vector: (as if in doc sample) - auto stream = context->getCudaStream(); - NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); - segmentMinFunctor_(context, input, indices, &tempRes); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut, &tempRes}); - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentMaxBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); - Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentMaxBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), - inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, 
indices, gradOut, &tempRes}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - template - int segmentSumFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - auto stream = context->getCudaStream(); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentSumBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), - input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentSumBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), - inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - template - int segmentMeanFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - auto stream = context->getCudaStream(); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); - auto numClasses = indices->e(indices->lengthOf() - 1) + 1; - NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); - - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); - - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentMeanBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), - input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), lengths, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector 
dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); -// auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentMeanBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - gradOut->specialBuffer(), gradOut->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), lengths, - output->specialBuffer(), output->specialShapeInfo(), inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - template - int segmentProdFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - auto stream = context->getCudaStream(); - NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); - segmentProdFunctor_(context, input, indices, &tempRes); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); - if (input->isVector()) { - Nd4jLong loopSize = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentProdBPLinearKernel<<lengthOf(), loopSize, 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); - Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentProdBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), 
gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), - inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut}); - return Status::OK(); - } - - // -------------------------------------------------------------------------------------------------------------- // - int segmentMaxFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMaxFunctorBP_, (context, input, - indices, gradOut, output), NUMERIC_TYPES, INTEGER_TYPES); - } - - // -------------------------------------------------------------------------------------------------------------- // - // segmen min - int segmentMinFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMinFunctorBP_, (context, input, - indices, gradOut, output), NUMERIC_TYPES, INTEGER_TYPES); - } - - // -------------------------------------------------------------------------------------------------------------- // - // segmen mean - int segmentMeanFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMeanFunctorBP_, (context, input, - indices, gradOut, output), NUMERIC_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - int segmentSumFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentSumFunctorBP_, (context, input, - indices, gradOut, output), NUMERIC_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - int segmentProdFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentProdFunctorBP_, (context, input, - indices, gradOut, output), FLOAT_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - BUILD_DOUBLE_TEMPLATE(template int segmentMaxFunctorBP_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int segmentMinFunctorBP_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int segmentSumFunctorBP_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int segmentMeanFunctorBP_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int segmentProdFunctorBP_, (nd4j::LaunchContext* context , NDArray* input, NDArray* 
indices, NDArray* gradOut, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - - // -------------------------------------------------------------------------------------------------------------- // - // Unsorted backpropagate segment ops - // -------------------------------------------------------------------------------------------------------------- // - - template - static int unsortedSegmentMaxFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - //int numOfClasses = gradOut->sizeAt(0); - // if input is a vector: (as if in doc sample) - auto stream = context->getCudaStream(); - NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); - unsortedSegmentMaxFunctor_(context, input, indices, numOfClasses, &tempRes); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut, &tempRes}); - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentMaxBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); - Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentMaxBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), - inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut, &tempRes}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - - - template - static int unsortedSegmentMinFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - //int numOfClasses = gradOut->sizeAt(0); - // if input is a vector: (as if in doc sample) - auto stream = context->getCudaStream(); - NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), 
context);//->shapeInfo(), context); - unsortedSegmentMinFunctor_(context, input, indices, numOfClasses, &tempRes); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut, &tempRes}); - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentMaxBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); - Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentMaxBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), - inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut, &tempRes}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - - - template - static int unsortedSegmentMeanFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); - auto numClasses = indices->e(indices->lengthOf() - 1) + 1; - NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); - - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); - int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); - - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentMeanBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), - input->specialShapeInfo(), 
gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), lengths, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); -// auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentMeanBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - gradOut->specialBuffer(), gradOut->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), lengths, - output->specialBuffer(), output->specialShapeInfo(), inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static int unsortedSegmentSumFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentSumBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), - input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentSumBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), - inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - 
NDArray::registerSpecialUse({output}, {input, indices, gradOut}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static int unsortedSegmentProdFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); - NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); - unsortedSegmentProdFunctor_(context, input, indices, numOfClasses, &tempRes); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); - if (input->isVector()) { - Nd4jLong loopSize = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentProdBPLinearKernel<<lengthOf(), loopSize, 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); - Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentProdBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), - inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut}); - return Status::OK(); - } - // -------------------------------------------------------------------------------------------------------------- // - - template - static int unsortedSegmentSqrtNFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - auto stream = context->getCudaStream(); - NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); - auto numClasses = indices->e(indices->lengthOf() - 1) + 1; - NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); - NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); - - classesRangesBegs.assign(indices->lengthOf()); - classesRangesLens.assign(0); - dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); - int* begins = 
reinterpret_cast(classesRangesBegs.specialBuffer()); - int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); - fillUpSegmentsKernel<<>>(indices->specialBuffer(), indices->specialShapeInfo(), numClasses, begins, lengths); - - if (input->isVector()) { - Nd4jLong loop_size = input->lengthOf(); - auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); - segmentSqrtNBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), - input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), - indices->specialBuffer(), indices->specialShapeInfo(), lengths, output->specialBuffer(), output->specialShapeInfo()); - } - else { - std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); -// auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); - auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); - Nd4jLong* inputTads = packX.specialShapeInfo(); - Nd4jLong* inputTadOffsets = packX.specialOffsets(); - Nd4jLong* outputTads = packZ.specialShapeInfo(); - Nd4jLong* outputTadOffsets = packZ.specialOffsets(); - Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); - Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); - - segmentSqrtNBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), - gradOut->specialBuffer(), gradOut->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), lengths, - output->specialBuffer(), output->specialShapeInfo(), inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, - outputTads, outputTadOffsets); - } - NDArray::registerSpecialUse({output}, {input, indices, gradOut}); - - return Status::OK(); - } - // ============================================================================================================== // - int unsortedSegmentMaxFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMaxFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), NUMERIC_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - int unsortedSegmentMinFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMinFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), NUMERIC_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - int unsortedSegmentSumFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentSumFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), NUMERIC_TYPES, INTEGER_TYPES); - } - // 
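-------------------------------------------------------------------------------------------------------------- //

The wrapper functions removed above (and re-homed into per-op translation units such as segment_max.cu below) all follow libnd4j's runtime double-dispatch idiom: BUILD_DOUBLE_SELECTOR selects, from the output data type and the index data type, one of the template instantiations that BUILD_DOUBLE_TEMPLATE forces the compiler to emit. A self-contained toy model of that dispatch, reduced to two type pairs; this is not the literal macro expansion, and the names here are hypothetical:

    // Toy model of the (output type, index type) double dispatch performed by
    // BUILD_DOUBLE_SELECTOR; the real macros enumerate the full
    // NUMERIC_TYPES/FLOAT_TYPES x INTEGER_TYPES grid, not just two pairs.
    #include <cstdint>
    #include <stdexcept>

    enum class DType { FLOAT32, DOUBLE, INT32, INT64 }; // stand-in for nd4j::DataType

    template <typename T, typename I>
    int segmentOpBP() { /* would run the <T, I> kernel instantiation */ return 0; }

    int dispatchSegmentOpBP(DType out, DType idx) {
        if (out == DType::FLOAT32)
            return idx == DType::INT64 ? segmentOpBP<float, int64_t>() : segmentOpBP<float, int32_t>();
        if (out == DType::DOUBLE)
            return idx == DType::INT64 ? segmentOpBP<double, int64_t>() : segmentOpBP<double, int32_t>();
        throw std::runtime_error("dispatchSegmentOpBP: unsupported type pair");
    }

Splitting each op into its own .cu file keeps this instantiation grid, and hence compile time and object size per translation unit, bounded.

- // 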
-------------------------------------------------------------------------------------------------------------- // - - int unsortedSegmentMeanFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMeanFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - int unsortedSegmentProdFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentProdFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - int unsortedSegmentSqrtNFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { - BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentSqrtNFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INTEGER_TYPES); - } - // -------------------------------------------------------------------------------------------------------------- // - - BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentMaxFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentMinFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentSumFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentMeanFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentProdFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentSqrtNFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); - // -------------------------------------------------------------------------------------------------------------- // + BUILD_SINGLE_TEMPLATE(template void fillUpSegments_, (NDArray* indices, Nd4jLong numClasses, NDArray& classesRangesBegs, NDArray& classesRangesLens), INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu new file mode 100644 index 000000000..a1792750f --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu @@ -0,0 
+1,427 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author GS +// + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace nd4j { + namespace ops { + namespace helpers { + + // -------------------------------------------------------------------------------------------------------------- // + // Segment ops linear kernels + // -------------------------------------------------------------------------------------------------------------- // + + template + static __global__ void + segmentMaxLinearKernel(void *input, Nd4jLong *inputShape, int *starts, int *lengths, Nd4jLong numOfClasses, + void *output, Nd4jLong *outputShape) { + __shared__ + T *val; + __shared__ + Nd4jLong xLen, zLen, segment, zIndex; + __shared__ + T *x; + __shared__ + T *z; + __shared__ int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; + segment = blockIdx.x / threadsPerSegment; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + extern __shared__ unsigned char shmem[]; + val = reinterpret_cast(shmem); + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + + if (segment < numOfClasses) { + zIndex = shape::getIndexOffset(segment, outputShape, zLen); + start = starts[segment]; + finish = start + lengths[segment]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + val[segment] = z[zIndex]; + } + + } + __syncthreads(); + + for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); + } + } + // -------------------------------------------------------------------------------------------------------------- // + + template + static __global__ void + unsortedSegmentMaxLinearKernel(void *input, Nd4jLong *inputShape, void *indices, Nd4jLong *indicesShape, + int *starts, int *lengths, Nd4jLong numOfClasses, void *output, + Nd4jLong *outputShape) { + __shared__ + T *val; + __shared__ + Nd4jLong xLen, zLen, segment, zIndex; + __shared__ + T *x; + __shared__ + T *z; + __shared__ + I *y; //int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + segment = blockIdx.x; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + y = reinterpret_cast(indices); + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + + zIndex = shape::getIndexOffset(segment, outputShape, zLen); + //start = starts[segment]; + //finish = start + lengths[segment]; + if (lengths[segment] > 0) + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + else + z[zIndex] = -DataTypeUtils::max(); + } + __syncthreads(); + if (lengths[segment] > 0) + for (auto e = threadIdx.x + 1; e < xLen; 
e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + if (y[yIndex] == segment) { + nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void segmentMaxTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, + Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, + Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets, T filler = 0) { + + __shared__ T* val; + __shared__ Nd4jLong len, segment, zIndex, total; + __shared__ T* z; + __shared__ int start, finish; + + if (threadIdx.x == 0) { + segment = indices[blockIdx.x]; // / threadsPerSegment; + z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; + len = shape::length(inputTads); + + start = starts[segment]; + finish = start + lengths[segment]; + total = shape::sizeAt(inputShape, 0); + } + __syncthreads(); + + auto idx = blockIdx.x; + if (blockIdx.x <= total) { + auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; + if (blockIdx.x == start) { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + z[zIndex] = x[xIndex]; + } + } + else { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); + } + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + + template + static void segmentMaxFunctor_(LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { + //int numClasses = output->sizeAt(0); + // if input is a vector: (as if in doc sample) + //Nd4jLong idx = indices->e(0); + auto stream = context->getCudaStream(); + indices->syncToHost(); + Nd4jLong numOfClasses = indices->e(indices->lengthOf() - 1) + 1; + NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); + + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + dim3 dims(256, 512, 256); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens); + + NDArray::prepareSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens}); + + if (input->isVector()) { + + segmentMaxLinearKernel<<lengthOf(), numOfClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = 
+
+    template <typename T, typename I>
+    static void segmentMaxFunctor_(LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) {
+        //int numClasses = output->sizeAt(0);
+        // if input is a vector: (as if in doc sample)
+        //Nd4jLong idx = indices->e<Nd4jLong>(0);
+        auto stream = context->getCudaStream();
+        indices->syncToHost();
+        Nd4jLong numOfClasses = indices->e<Nd4jLong>(indices->lengthOf() - 1) + 1;
+        NDArray classesRangesLens = NDArrayFactory::create<int>('c', {numOfClasses});
+        NDArray classesRangesBegs = NDArrayFactory::create<int>('c', {numOfClasses});
+
+        classesRangesBegs.assign(indices->lengthOf());
+        classesRangesLens.assign(0);
+        dim3 dims(256, 512, 256);
+        int* begins = reinterpret_cast<int*>(classesRangesBegs.specialBuffer());
+        int* lengths = reinterpret_cast<int*>(classesRangesLens.specialBuffer());
+        fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens);
+
+        NDArray::prepareSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens});
+
+        if (input->isVector()) {
+            segmentMaxLinearKernel<T,I><<<numOfClasses, input->lengthOf(), numOfClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo());
+        }
+        else {
+            std::vector<int> dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0});
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions);
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions);
+            Nd4jLong* inputTads = packX.specialShapeInfo();
+            Nd4jLong* inputTadOffsets = packX.specialOffsets();
+            Nd4jLong* outputTads = packZ.specialShapeInfo();
+            Nd4jLong* outputTadOffsets = packZ.specialOffsets();
+            segmentMaxTadKernel<T,I><<<input->sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast<I*>(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets);
+        }
+        NDArray::registerSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens});
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    void segmentMaxFunctor(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) {
+        BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentMaxFunctor_, (context, input, indices, output), NUMERIC_TYPES, INTEGER_TYPES);
+    }
+    BUILD_DOUBLE_TEMPLATE(template void segmentMaxFunctor_, (LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES);
+    // -------------------------------------------------------------------------------------------------------------- //
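For reference, the sorted functor follows TensorFlow's segment_max contract: indices are sorted and non-negative, and the output holds one slot per segment id. A hypothetical host-side driver (not a test from this PR; it assumes the NDArrayFactory::create overloads used elsewhere in this diff):

// illustrative driver, not part of this PR: segment_max over a vector
#include <NDArrayFactory.h>
#include <ops/declarable/helpers/segment.h>

void demoSegmentMax(nd4j::LaunchContext* context) {
    auto input   = NDArrayFactory::create<float>('c', {5}, {1.f, 3.f, 2.f, 6.f, 4.f});
    auto indices = NDArrayFactory::create<int>('c', {5}, {0, 0, 1, 1, 2}); // sorted segment ids
    auto output  = NDArrayFactory::create<float>('c', {3});                // one slot per segment
    nd4j::ops::helpers::segmentMaxFunctor(context, &input, &indices, &output);
    // output -> [3, 6, 4]
}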
+
+    template <typename T, typename I>
+    static void unsortedSegmentMaxFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) {
+        auto stream = context->getCudaStream();
+//        NDArray classes = NDArrayFactory::create<int>('c', {numOfClasses, 2});
+        NDArray classesRangesBegs = NDArrayFactory::create<int>('c', {numOfClasses});
+        NDArray classesRangesLens = NDArrayFactory::create<int>('c', {numOfClasses});
+//        NDArray row = NDArrayFactory::create<int>('c', {1, 2}, {(int)indices->lengthOf(), (int)0});
+//        classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes);
+        classesRangesBegs.assign(indices->lengthOf());
+        classesRangesLens.assign(0);
+        dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32);
+//        int* classesBuf = reinterpret_cast<int*>(classes.specialBuffer());
+        fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens);
+        int* begins = reinterpret_cast<int*>(classesRangesBegs.getSpecialBuffer());
+        int* lengths = reinterpret_cast<int*>(classesRangesLens.getSpecialBuffer());
+
+        if (input->isVector()) {
+            unsortedSegmentMaxLinearKernel<T,I><<<dims.x, dims.y, dims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo());
+        }
+        else {
+            std::vector<int> dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0});
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions);
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions);
+            Nd4jLong* inputTads = packX.specialShapeInfo();
+            Nd4jLong* inputTadOffsets = packX.specialOffsets();
+            Nd4jLong* outputTads = packZ.specialShapeInfo();
+            Nd4jLong* outputTadOffsets = packZ.specialOffsets();
+            dims.x = input->sizeAt(0);
+            output->assign(-DataTypeUtils::max<T>());
+            segmentMaxTadKernel<T,I><<<dims.x, dims.y, dims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast<I*>(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets);
+        }
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    void unsortedSegmentMaxFunctor(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) {
+        BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMaxFunctor_, (context, input, indices, numOfClasses, output), NUMERIC_TYPES, INTEGER_TYPES);
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentMaxFunctor_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES);
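The unsorted variant drops the sortedness assumption: ids may arrive in any order, and a segment with no entries is filled with -DataTypeUtils::max<T>(), the lowest representable value, mirroring TensorFlow's unsorted_segment_max. A plain host-side reference of that semantics (illustrative helper, not part of the PR):

#include <limits>
#include <vector>

// reference semantics for unsorted_segment_max (illustrative only)
std::vector<float> unsortedSegmentMaxRef(const std::vector<float>& x,
                                         const std::vector<int>& ids, int numOfClasses) {
    // empty segments keep the lowest representable value
    std::vector<float> z(numOfClasses, -std::numeric_limits<float>::max());
    for (size_t e = 0; e < x.size(); ++e)
        if (x[e] > z[ids[e]])
            z[ids[e]] = x[e];   // ids may arrive in any order
    return z;
}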
+    // -------------------------------------------------------------------------------------------------------------- //
+    // segment max
+    // -------------------------------------------------------------------------------------------------------------- //
+    template <typename T, typename I>
+    static __global__ void segmentMaxBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput,
+                                                    Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape,
+                                                    void* outputBuf, Nd4jLong* outputShape) {
+        __shared__ T* x;
+        __shared__ T* gradIn;
+        __shared__ T* gradOut;
+        __shared__ I* y;
+        __shared__ T* z;
+        __shared__ Nd4jLong xLen, gradLen;
+
+        if (threadIdx.x == 0) {
+            xLen = shape::length(inputShape);
+            x = reinterpret_cast<T*>(inputBuf);
+            y = reinterpret_cast<I*>(indicesBuf);
+            z = reinterpret_cast<T*>(outputBuf);
+            gradIn = reinterpret_cast<T*>(forwardOutput);
+            gradOut = reinterpret_cast<T*>(eps);
+            gradLen = shape::length(epsShape);
+        }
+        __syncthreads(); // shared setup must be visible to the whole block
+
+        auto start = blockIdx.x * blockDim.x + threadIdx.x;
+        auto step = gridDim.x * blockDim.x;
+
+        for (auto e = start; e < xLen; e += step) {
+            auto zOffset = shape::getIndexOffset(e, outputShape, xLen);
+            auto xOffset = shape::getIndexOffset(e, inputShape, xLen);
+            auto yOffset = shape::getIndexOffset(e, indicesShape, xLen);
+            auto classIndex = y[yOffset];
+            auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen);
+            auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen);
+
+            if (nd4j::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) {
+                z[zOffset] = gradOut[gradOffsetO];
+            }
+        }
+    }
+
+    // -------------------------------------------------------------------------------------------------------------- //
+    template <typename T, typename I>
+    static __global__ void segmentMaxBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput,
+                                                 Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape,
+                                                 void* outputBuf, Nd4jLong* outputShape, Nd4jLong* inputTad,
+                                                 Nd4jLong* inputOffsets, Nd4jLong* gradInTad, Nd4jLong* gradInOffsets,
+                                                 Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad,
+                                                 Nd4jLong* outOffsets) {
+        __shared__ T* x;
+        __shared__ T* gradIn;
+        __shared__ T* gradOut;
+        __shared__ I* y;
+        __shared__ T* z;
+        __shared__ Nd4jLong xLen, yLen, gradLen, currentLen;
+
+        if (threadIdx.x == 0) {
+            xLen = shape::length(inputShape);
+            x = reinterpret_cast<T*>(inputBuf);
+            y = reinterpret_cast<I*>(indicesBuf);
+            z = reinterpret_cast<T*>(outputBuf);
+            yLen = shape::length(indicesShape);
+            gradOut = reinterpret_cast<T*>(eps);
+            gradIn = reinterpret_cast<T*>(forwardOutput);
+            gradLen = shape::length(epsShape);
+            currentLen = shape::length(outTad);
+        }
+        __syncthreads(); // shared setup must be visible to the whole block
+
+        for (auto i = blockIdx.x; i < yLen; i += gridDim.x) {
+            auto yIndex = shape::getIndexOffset(i, indicesShape, yLen);
+            auto segment = y[yIndex];
+            T* current = x + inputOffsets[i];
+            T* currentOut = z + outOffsets[i];
+            T* in = gradIn + gradInOffsets[segment];
+            T* outGrad = gradOut + gradOutOffsets[segment];
+
+            for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) {
+                if (nd4j::math::nd4j_abs(in[e] - current[e]) <= T(1.e-6))
+                    currentOut[e] = outGrad[e];
+            }
+        }
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    template <typename T, typename I>
+    int segmentMaxFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) {
+        //int numOfClasses = gradOut->sizeAt(0);
+        // if input is a vector: (as if in doc sample)
+        auto stream = context->getCudaStream();
+        NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT<T>(), context);//->shapeInfo(), context);
+        segmentMaxFunctor_<T, I>(context, input, indices, &tempRes);
+        NDArray::prepareSpecialUse({output}, {input, indices, gradOut, &tempRes});
+        if (input->isVector()) {
+            Nd4jLong loop_size = input->lengthOf();
+            auto numOfClasses = gradOut->lengthOf(); //indices->e<Nd4jLong>(loop_size - 1);
+            segmentMaxBPLinearKernel<T,I><<<1 + gradOut->lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(),
+                    tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(),
+                    indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo());
+        }
+        else {
+            std::vector<int> dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0});
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions);
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions);
+            auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions);
+            auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions);
+            Nd4jLong* inputTads = packX.specialShapeInfo();
+            Nd4jLong* inputTadOffsets = packX.specialOffsets();
+            Nd4jLong* outputTads = packZ.specialShapeInfo();
+            Nd4jLong* outputTadOffsets = packZ.specialOffsets();
+            Nd4jLong* gradInTads = packGradIn.specialShapeInfo();
+            Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets();
+            Nd4jLong* gradOutTads = packGradOut.specialShapeInfo();
+            Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets();
+
+            segmentMaxBPTadKernel<T,I><<<gradOut->lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(),
+                    tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(),
+                    indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(),
+                    inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets,
+                    outputTads, outputTadOffsets);
+        }
+        NDArray::registerSpecialUse({output}, {input, indices, gradOut, &tempRes});
+        return Status::OK();
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    int segmentMaxFunctorBP(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) {
+        BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMaxFunctorBP_, (context, input,
+                indices, gradOut, output), NUMERIC_TYPES, INTEGER_TYPES);
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    BUILD_DOUBLE_TEMPLATE(template int segmentMaxFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES);
+
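The backward pass recomputes the forward result into tempRes, then routes each segment's upstream gradient to every input position whose value matches the segment max within 1e-6; with ties, several positions can receive the full gradient. A host-side reference of that routing rule (hypothetical helper for reading the kernels against):

#include <cmath>
#include <vector>

// reference: gradient wrt input of segment_max (illustrative only)
std::vector<float> segmentMaxBPRef(const std::vector<float>& x, const std::vector<int>& ids,
                                   const std::vector<float>& segMax,    // forward result per segment
                                   const std::vector<float>& gradOut) { // upstream grad per segment
    std::vector<float> gradIn(x.size(), 0.f);
    for (size_t e = 0; e < x.size(); ++e)
        if (std::fabs(segMax[ids[e]] - x[e]) <= 1e-6f)  // this element produced the max
            gradIn[e] = gradOut[ids[e]];
    return gradIn;
}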
+    // -------------------------------------------------------------------------------------------------------------- //
+    template <typename T, typename I>
+    static int unsortedSegmentMaxFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) {
+        //int numOfClasses = gradOut->sizeAt(0);
+        // if input is a vector: (as if in doc sample)
+        auto stream = context->getCudaStream();
+        NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT<T>(), context);//->shapeInfo(), context);
+        unsortedSegmentMaxFunctor_<T, I>(context, input, indices, numOfClasses, &tempRes);
+        NDArray::prepareSpecialUse({output}, {input, indices, gradOut, &tempRes});
+        if (input->isVector()) {
+            Nd4jLong loop_size = input->lengthOf();
+            auto numOfClasses = gradOut->lengthOf(); //indices->e<Nd4jLong>(loop_size - 1);
+            segmentMaxBPLinearKernel<T,I><<<gradOut->lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(),
+                    tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(),
+                    indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo());
+        }
+        else {
+            std::vector<int> dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0});
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions);
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions);
+            auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions);
+            auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions);
+            Nd4jLong* inputTads = packX.specialShapeInfo();
+            Nd4jLong* inputTadOffsets = packX.specialOffsets();
+            Nd4jLong* outputTads = packZ.specialShapeInfo();
+            Nd4jLong* outputTadOffsets = packZ.specialOffsets();
+            Nd4jLong* gradInTads = packGradIn.specialShapeInfo();
+            Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets();
+            Nd4jLong* gradOutTads = packGradOut.specialShapeInfo();
+            Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets();
+
+            segmentMaxBPTadKernel<T,I><<<gradOut->lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(),
+                    tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(),
+                    indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(),
+                    inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets,
+                    outputTads, outputTadOffsets);
+        }
+        NDArray::registerSpecialUse({output}, {input, indices, gradOut, &tempRes});
+        return Status::OK();
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    int unsortedSegmentMaxFunctorBP(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) {
+        BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMaxFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), NUMERIC_TYPES, INTEGER_TYPES);
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentMaxFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES);
+}
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
new file mode 100644
index 000000000..19c50728a
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu
@@ -0,0 +1,414 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author GS
+//
+
+#include <ops/declarable/helpers/segment.h>
+#include <ops/declarable/helpers/segment_common.h>
+#include <NDArrayFactory.h>
+#include <helpers/ShapeUtils.h>
+#include <helpers/TAD.h>
+#include <exceptions/cuda_exception.h>
+#include <PointersManager.h>
+#include <ConstantTadHelper.h>
+
+namespace nd4j {
+namespace ops {
+namespace helpers {
+    // -------------------------------------------------------------------------------------------------------------- //
+    // Segment ops linear kernels
+    // -------------------------------------------------------------------------------------------------------------- //
+    template <typename T, typename I>
+    static __global__ void segmentMeanLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) {
+        __shared__ T* val;
+        __shared__ Nd4jLong xLen, zLen, segment, zIndex;
+        __shared__ T* x;
+        __shared__ T* z;
+        __shared__ int threadsPerSegment, start, finish;
+
+        if (threadIdx.x == 0) {
+            threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses;
+            segment = blockIdx.x / threadsPerSegment;
+            x = reinterpret_cast<T*>(input);
+            z = reinterpret_cast<T*>(output);
+//            extern __shared__ unsigned char shmem[];
+//            val = reinterpret_cast<T*>(shmem);
+            xLen = shape::length(inputShape);
+            zLen = shape::length(outputShape);
+
+            //[zIndex] =
+            if (segment < numOfClasses) {
+                zIndex = shape::getIndexOffset(segment, outputShape, zLen);
+                start = starts[segment];
+                finish = start + lengths[segment];
+                //val[segment] = ;
+                z[zIndex] = T(x[shape::getIndexOffset(start, inputShape, xLen)] / lengths[segment]);
+//                val[segment] = z[zIndex];
+            }
+        }
+        __syncthreads();
+
+        for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) {
+            auto xIndex = shape::getIndexOffset(e, inputShape, xLen);
+            if (lengths[segment])
+                nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex] / lengths[segment]));
+        }
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    template <typename T, typename I>
+    static __global__ void unsortedSegmentMeanLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) {
+        __shared__ T* val;
+        __shared__ Nd4jLong xLen, zLen, segment, zIndex;
+        __shared__ T* x;
+        __shared__ T* z;
+        __shared__ I* y; //int threadsPerSegment, start, finish;
+
+        if (threadIdx.x == 0) {
+//            threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses;
+            segment = blockIdx.x;// / threadsPerSegment;
+            x = reinterpret_cast<T*>(input);
+            z = reinterpret_cast<T*>(output);
+            y = reinterpret_cast<I*>(indices);
+//            extern __shared__ unsigned char shmem[];
+//            val = reinterpret_cast<T*>(shmem);
+            xLen = shape::length(inputShape);
+            zLen = shape::length(outputShape);
+
+//            if (segment < numOfClasses) {
+            zIndex = shape::getIndexOffset(segment, outputShape, zLen);
+            //start = starts[segment];
+            //finish = start + lengths[segment];
+            if (lengths[segment] > 0)
+                z[zIndex] = T(x[shape::getIndexOffset(starts[segment], inputShape, xLen)] / T(lengths[segment]));
+            else
+                z[zIndex] = 0; //DataTypeUtils::max<T>();
+//            val[segment] = z[zIndex];
+//            }
+        }
+        __syncthreads();
+
+        if (lengths[segment] > 0)
+            for (auto e = threadIdx.x; e < xLen; e += blockDim.x) {
+                auto xIndex = shape::getIndexOffset(e, inputShape, xLen);
+                auto yIndex = shape::getIndexOffset(e, indicesShape, xLen);
+                if (y[yIndex] == segment && e != starts[segment]) {
+                    nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex] / T(lengths[segment])));
+                }
+            }
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    // SegmentMean kernel
+    template <typename T, typename I>
+    static __global__ void segmentMeanTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) {
+        __shared__ T* val;
+        __shared__ Nd4jLong len, segment, zIndex, total;
+        __shared__ T* z;
+        __shared__ int threadsPerSegment, start, finish;
+
+        if (threadIdx.x == 0) {
+            segment = indices[blockIdx.x]; // / threadsPerSegment;
+            z = reinterpret_cast<T*>(outputBuf) + outputTadOffsets[segment];
+            len = shape::length(inputTads);
+            start = starts[segment];
+            finish = start + lengths[segment];
+            total = shape::sizeAt(inputShape, 0);
+        }
+        __syncthreads();
+
+        auto idx = blockIdx.x;
+        if (blockIdx.x <= total) {
+            auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx];
+            if (blockIdx.x == start) {
+                for (auto e = threadIdx.x; e < len; e += blockDim.x) {
+                    auto xIndex = shape::getIndexOffset(e, inputTads, len);
+                    auto zIndex = shape::getIndexOffset(e, outputTads, len);
+                    z[zIndex] = T(x[xIndex] / lengths[segment]);
+                }
+            }
+            else {
+                for (auto e = threadIdx.x; e < len; e += blockDim.x) {
+                    auto xIndex = shape::getIndexOffset(e, inputTads, len);
+                    auto zIndex = shape::getIndexOffset(e, outputTads, len);
+                    if (lengths[segment])
+                        nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex] / lengths[segment]));
+                }
+            }
+        }
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    // segment mean
+    template <typename T, typename I>
+    static void segmentMeanFunctor_(LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) {
+        auto stream = context->getCudaStream();
+        Nd4jLong numClasses = indices->e<Nd4jLong>(indices->lengthOf() - 1) + 1;
+        NDArray classesRangesLens = NDArrayFactory::create<int>('c', {numClasses});
+        NDArray classesRangesBegs = NDArrayFactory::create<int>('c', {numClasses});
+
+        classesRangesBegs.assign(indices->lengthOf());
+        classesRangesLens.assign(0);
+
+        dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32);
+        int* begins = reinterpret_cast<int*>(classesRangesBegs.specialBuffer());
+        int* lengths = reinterpret_cast<int*>(classesRangesLens.specialBuffer());
+        fillUpSegments(indices, numClasses, classesRangesBegs, classesRangesLens);
+
+        if (input->isVector()) {
+            segmentMeanLinearKernel<T,I><<<numClasses, input->lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(),
input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + segmentMeanTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + } + + } + // -------------------------------------------------------------------------------------------------------------- // + void segmentMeanFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), segmentMeanFunctor_, (context, input, indices, output), FLOAT_TYPES, INTEGER_TYPES); + } + BUILD_DOUBLE_TEMPLATE(template void segmentMeanFunctor_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + template + static void unsortedSegmentMeanFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); +// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); + NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); +// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); +// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); +// int* classesBuf = reinterpret_cast(classes.specialBuffer()); + fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + + if (input->isVector()) { + unsortedSegmentMeanLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + output->assign(0); + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + dims.x = input->sizeAt(0); + segmentMeanTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, 
inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + } + + } + // -------------------------------------------------------------------------------------------------------------- // + void unsortedSegmentMeanFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMeanFunctor_, (context, input, indices, numOfClasses, output), + FLOAT_TYPES, INTEGER_TYPES); + } + // -------------------------------------------------------------------------------------------------------------- // + BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentMeanFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void segmentMeanBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, + int* lengths, void* outputBuf, Nd4jLong* outputShape) { + __shared__ T* x; + __shared__ T* gradIn; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, gradLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + gradOut = reinterpret_cast(eps); + gradLen = shape::length(epsShape); + } + + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = gridDim.x * blockDim.x; + + for (auto e = start; e < xLen; e += step) { + + auto zOffset = shape::getIndexOffset(e, outputShape, xLen); + auto xOffset = shape::getIndexOffset(e, inputShape, xLen); + auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto classIndex = y[yOffset]; + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + + z[zOffset] = T(gradOut[gradOffsetO] / float(lengths[classIndex])); + } + } + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void segmentMeanBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, + void* indicesBuf, Nd4jLong* indicesShape, int* lengths, void* outputBuf, Nd4jLong* outputShape,Nd4jLong* inputTad, + Nd4jLong* inputOffsets, Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, Nd4jLong* outOffsets) { + __shared__ T* x; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + yLen = shape::length(indicesShape); + gradOut = reinterpret_cast(eps); + gradLen = shape::length(epsShape); + currentLen = shape::length(outTad); + } + __syncthreads(); + + for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { +// auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto segment = y[i]; //yIndex]; + T* currentOut = z + outOffsets[i]; + T* outGrad = gradOut + gradOutOffsets[segment]; + + for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { + auto zIndex = shape::getIndexOffset(e, outTad, currentLen); + auto gradIndex = 
shape::getIndexOffset(e, gradOutTad, gradLen);
+                if (lengths[segment] > 0)
+                    currentOut[zIndex] = T(outGrad[gradIndex] / float(lengths[segment]));
+            }
+        }
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    // backprop for mean
+    template <typename T, typename I>
+    int segmentMeanFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) {
+        auto stream = context->getCudaStream();
+        NDArray::prepareSpecialUse({output}, {input, indices, gradOut});
+        auto numClasses = indices->e<Nd4jLong>(indices->lengthOf() - 1) + 1;
+        NDArray classesRangesLens = NDArrayFactory::create<int>('c', {numClasses});
+        NDArray classesRangesBegs = NDArrayFactory::create<int>('c', {numClasses});
+
+        classesRangesBegs.assign(indices->lengthOf());
+        classesRangesLens.assign(0);
+        dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32);
+        fillUpSegments(indices, numClasses, classesRangesBegs, classesRangesLens);
+        int* begins = reinterpret_cast<int*>(classesRangesBegs.specialBuffer());
+        int* lengths = reinterpret_cast<int*>(classesRangesLens.specialBuffer());
+
+        if (input->isVector()) {
+            Nd4jLong loop_size = input->lengthOf();
+            auto numOfClasses = gradOut->lengthOf(); //indices->e<Nd4jLong>(loop_size - 1);
+            segmentMeanBPLinearKernel<T,I><<<gradOut->lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(),
+                    input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(),
+                    indices->specialBuffer(), indices->specialShapeInfo(), lengths, output->specialBuffer(), output->specialShapeInfo());
+        }
+        else {
+            std::vector<int> dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0});
+            auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions);
+            auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions);
+//            auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions);
+            auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions);
+            Nd4jLong* inputTads = packX.specialShapeInfo();
+            Nd4jLong* inputTadOffsets = packX.specialOffsets();
+            Nd4jLong* outputTads = packZ.specialShapeInfo();
+            Nd4jLong* outputTadOffsets = packZ.specialOffsets();
+            Nd4jLong* gradOutTads = packGradOut.specialShapeInfo();
+            Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets();
+
+            segmentMeanBPTadKernel<T,I><<<gradOut->lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(),
+                    gradOut->specialBuffer(), gradOut->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), lengths,
+                    output->specialBuffer(), output->specialShapeInfo(), inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets,
+                    outputTads, outputTadOffsets);
+        }
+        NDArray::registerSpecialUse({output}, {input, indices, gradOut});
+        return Status::OK();
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    // segment mean bp main
+    int segmentMeanFunctorBP(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) {
+        BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMeanFunctorBP_, (context, input,
+                indices, gradOut, output), NUMERIC_TYPES, INTEGER_TYPES);
+    }
+    // -------------------------------------------------------------------------------------------------------------- //
+    BUILD_DOUBLE_TEMPLATE(template int
segmentMeanFunctorBP_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + + template + static int unsortedSegmentMeanFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); + NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); + auto numClasses = indices->e(indices->lengthOf() - 1) + 1; + NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); + + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); + fillUpSegments(indices, numClasses, classesRangesBegs, classesRangesLens); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + + if (input->isVector()) { + Nd4jLong loop_size = input->lengthOf(); + auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); + segmentMeanBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), + input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), lengths, output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); +// auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); + Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); + + segmentMeanBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + gradOut->specialBuffer(), gradOut->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), lengths, + output->specialBuffer(), output->specialShapeInfo(), inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, + outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices, gradOut}); + return Status::OK(); + } + // -------------------------------------------------------------------------------------------------------------- // + int unsortedSegmentMeanFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMeanFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INTEGER_TYPES); + } + // -------------------------------------------------------------------------------------------------------------- // + + 
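Unlike max, the mean backprop above needs no forward recomputation: every input position simply receives gradOut[segment] divided by the segment's element count. A host-side reference sketch of that rule (hypothetical helper, not part of this PR):

#include <vector>

// reference: gradient wrt input of segment_mean (illustrative only)
std::vector<float> segmentMeanBPRef(const std::vector<int>& ids, int numOfClasses,
                                    const std::vector<float>& gradOut) {
    std::vector<int> count(numOfClasses, 0);
    for (int id : ids) ++count[id];                    // segment sizes
    std::vector<float> gradIn(ids.size());
    for (size_t e = 0; e < ids.size(); ++e)
        gradIn[e] = gradOut[ids[e]] / count[ids[e]];   // d(mean)/dx_i = 1/n
    return gradIn;
}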
BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentMeanFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + +} +} +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu new file mode 100644 index 000000000..b5c76e18d --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu @@ -0,0 +1,423 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author GS +// + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { +namespace ops { +namespace helpers { + // -------------------------------------------------------------------------------------------------------------- // + // Segment ops linear kernels + // -------------------------------------------------------------------------------------------------------------- // + + template + static __global__ void + segmentMinLinearKernel(void *input, Nd4jLong *inputShape, int *starts, int *lengths, Nd4jLong numOfClasses, + void *output, Nd4jLong *outputShape) { + __shared__ + T *val; + __shared__ + Nd4jLong xLen, zLen, segment, zIndex; + __shared__ + T *x; + __shared__ + T *z; + __shared__ int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; + segment = blockIdx.x / threadsPerSegment; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + extern __shared__ unsigned char shmem[]; + val = reinterpret_cast(shmem); + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + + if (segment < numOfClasses) { + zIndex = shape::getIndexOffset(segment, outputShape, zLen); + start = starts[segment]; + finish = start + lengths[segment]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + val[segment] = z[zIndex]; + } + + } + __syncthreads(); + + for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); + } + + } + // -------------------------------------------------------------------------------------------------------------- // + + template + static __global__ void + unsortedSegmentMinLinearKernel(void *input, Nd4jLong *inputShape, void *indices, Nd4jLong *indicesShape, + int *starts, int *lengths, Nd4jLong numOfClasses, void *output, + Nd4jLong *outputShape) { + __shared__ + T *val; + __shared__ + Nd4jLong xLen, zLen, segment, zIndex; + __shared__ + T *x; + __shared__ + T *z; + __shared__ 
+ I *y; //int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + segment = blockIdx.x; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + y = reinterpret_cast(indices); + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + + zIndex = shape::getIndexOffset(segment, outputShape, zLen); + if (lengths[segment] > 0) + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + else + z[zIndex] = DataTypeUtils::max(); + + } + __syncthreads(); + if (lengths[segment] > 0) + for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + if (y[yIndex] == segment) { + nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); + } + } + } + // -------------------------------------------------------------------------------------------------------------- // +// SegmentMin kernel + template + static __global__ void segmentMinTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { + __shared__ T* val; + __shared__ Nd4jLong len, segment, zIndex, total; + __shared__ T* z; + __shared__ int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + segment = indices[blockIdx.x]; // / threadsPerSegment; + z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; + len = shape::length(inputTads); + start = starts[segment]; + finish = start + lengths[segment]; + total = shape::sizeAt(inputShape, 0); + + } + __syncthreads(); + + auto idx = blockIdx.x; + if (blockIdx.x <= total) { + auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; + if (blockIdx.x == start) { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + z[zIndex] = x[xIndex]; + } + } + else { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); + } + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + // segmen min + template + static void segmentMinFunctor_(LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { + auto stream = context->getCudaStream(); + Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; + NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); + + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + + fillUpSegments(indices, numClasses, classesRangesBegs, classesRangesLens); + NDArray::prepareSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens}); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + if (input->isVector()) { + segmentMinLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), 
{0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + segmentMinTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + + } + NDArray::registerSpecialUse({output}, {input, indices, &classesRangesBegs, &classesRangesLens}); + + } + // -------------------------------------------------------------------------------------------------------------- // + void segmentMinFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { + BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentMinFunctor_, (context, input, indices, output), NUMERIC_TYPES, INTEGER_TYPES); + } + BUILD_DOUBLE_TEMPLATE(template void segmentMinFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); + + // -------------------------------------------------------------------------------------------------------------- // + + template + static void unsortedSegmentMinFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); +// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); + NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); +// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); +// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); +// int* classesBuf = reinterpret_cast(classes.specialBuffer()); + fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + NDArray::prepareSpecialUse({output}, {input, indices}); + if (input->isVector()) { + unsortedSegmentMinLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + output->assign(DataTypeUtils::max()); + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + dims.x = input->sizeAt(0); + segmentMinTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, 
reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices}); + + } + // -------------------------------------------------------------------------------------------------------------- // + void unsortedSegmentMinFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentMinFunctor_, (context, input, indices, numOfClasses, output), + NUMERIC_TYPES, INTEGER_TYPES); + } + // -------------------------------------------------------------------------------------------------------------- // + + BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentMinFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); + template + static __global__ void segmentMinBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput, + Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, + void* outputBuf, Nd4jLong* outputShape) { + __shared__ T* x; + __shared__ T* gradIn; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, gradLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + gradIn = reinterpret_cast(forwardOutput); + gradOut = reinterpret_cast(eps); + gradLen = shape::length(epsShape); + } + + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = gridDim.x * blockDim.x; + + for (auto e = start; e < xLen; e += step) { + + auto zOffset = shape::getIndexOffset(e, outputShape, xLen); + auto xOffset = shape::getIndexOffset(e, inputShape, xLen); + auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto classIndex = y[yOffset]; + auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + + if (nd4j::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { + z[zOffset] = gradOut[gradOffsetO]; + } + } + } + + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void segmentMinBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput, + Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, + void* outputBuf, Nd4jLong* outputShape,Nd4jLong* inputTad, + Nd4jLong* inputOffsets, Nd4jLong* gradInTad, Nd4jLong* gradInOffsets, + Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, + Nd4jLong* outOffsets) { + __shared__ T* x; + __shared__ T* gradIn; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + yLen = shape::length(indicesShape); + gradOut = reinterpret_cast(eps); + gradIn = reinterpret_cast(forwardOutput); + gradLen = shape::length(epsShape); + currentLen = shape::length(outTad); + } + + for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { + auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto segment = 
y[yIndex]; + T* current = x + inputOffsets[i]; + T* currentOut = z + outOffsets[i]; + T* in = gradIn + gradInOffsets[segment]; + T* outGrad = gradOut + gradOutOffsets[segment]; + + for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { + if (nd4j::math::nd4j_abs(in[e] - current[e]) <= T(1.e-6)) + currentOut[e] = outGrad[e]; + } + } + } + + // -------------------------------------------------------------------------------------------------------------- // + template + int segmentMinFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + //int numOfClasses = gradOut->sizeAt(0); + // if input is a vector: (as if in doc sample) + auto stream = context->getCudaStream(); + NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); + segmentMinFunctor_(context, input, indices, &tempRes); + NDArray::prepareSpecialUse({output}, {input, indices, gradOut, &tempRes}); + if (input->isVector()) { + Nd4jLong loop_size = input->lengthOf(); + auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); + + segmentMinBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); + Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); + Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); + Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); + + segmentMinBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), + inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, + outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices, gradOut, &tempRes}); + return Status::OK(); + } + // -------------------------------------------------------------------------------------------------------------- // + // segmen min + int segmentMinFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentMinFunctorBP_, (context, input, + indices, gradOut, output), NUMERIC_TYPES, INTEGER_TYPES); + } + BUILD_DOUBLE_TEMPLATE(template int segmentMinFunctorBP_, 
(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + + template + static int unsortedSegmentMinFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + //int numOfClasses = gradOut->sizeAt(0); + // if input is a vector: (as if in doc sample) + auto stream = context->getCudaStream(); + NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); + unsortedSegmentMinFunctor_(context, input, indices, numOfClasses, &tempRes); + NDArray::prepareSpecialUse({output}, {input, indices, gradOut, &tempRes}); + if (input->isVector()) { + Nd4jLong loop_size = input->lengthOf(); + auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); + segmentMinBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); + Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); + Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); + Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); + + segmentMinBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), + inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, + outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices, gradOut, &tempRes}); + return Status::OK(); + } + // -------------------------------------------------------------------------------------------------------------- // + int unsortedSegmentMinFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentMinFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), NUMERIC_TYPES, INTEGER_TYPES); + } + // -------------------------------------------------------------------------------------------------------------- // + BUILD_DOUBLE_TEMPLATE(template int 
unsortedSegmentMinFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + +} +} +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu new file mode 100644 index 000000000..0a7c73040 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu @@ -0,0 +1,419 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author GS +// + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { +namespace ops { +namespace helpers { + // -------------------------------------------------------------------------------------------------------------- // + // Segment Prod ops linear kernels + // -------------------------------------------------------------------------------------------------------------- // + + template + static __global__ void segmentProdLinearKernel(void* input, Nd4jLong* inputShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { + __shared__ T* val; + __shared__ Nd4jLong xLen, zLen, segment, zIndex; + __shared__ T* x; + __shared__ T* z; + __shared__ int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; + segment = blockIdx.x / threadsPerSegment; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + extern __shared__ unsigned char shmem[]; + val = reinterpret_cast(shmem); + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + + if (segment < numOfClasses) { + zIndex = shape::getIndexOffset(segment, outputShape, zLen); + start = starts[segment]; + finish = start + lengths[segment]; + //val[segment] = ; + z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + val[segment] = z[zIndex]; + } + + } + __syncthreads(); +// auto tid = threadIdx.x + blockIdx.x * blockDim.x; +// auto step = blockDim.x * gridDim.x; + + for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + nd4j::math::atomics::nd4j_atomicMul(&val[segment], x[xIndex]); + } + __syncthreads(); + + if (threadIdx.x == 0) { + z[zIndex] = val[segment]; + } + + } + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void unsortedSegmentProdLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, 
Nd4jLong* outputShape) { + __shared__ T* val; + __shared__ Nd4jLong xLen, zLen, segment, zIndex; + __shared__ T* x; + __shared__ T* z; + __shared__ I* y; //int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { +// threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; + segment = blockIdx.x;// / threadsPerSegment; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + y = reinterpret_cast(indices); +// extern __shared__ unsigned char shmem[]; +// val = reinterpret_cast(shmem); + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + +// if (segment < numOfClasses) { + zIndex = shape::getIndexOffset(segment, outputShape, zLen); + //start = starts[segment]; + //finish = start + lengths[segment]; + if (lengths[segment] > 0) + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + else + z[zIndex] = 0; //DataTypeUtils::max(); +// val[segment] = z[zIndex]; +// } + + } + __syncthreads(); + if (lengths[segment] > 0) + for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + if (y[yIndex] == segment && e != starts[segment]) { + nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + // SegmentProd kernel + template + static __global__ void segmentProdTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { + __shared__ T* val; + __shared__ Nd4jLong len, segment, zIndex, total; + __shared__ T* z; + __shared__ int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + segment = indices[blockIdx.x]; // / threadsPerSegment; + z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; + len = shape::length(inputTads); + start = starts[segment]; + finish = start + lengths[segment]; + total = shape::sizeAt(inputShape, 0); + + } + __syncthreads(); + + auto idx = blockIdx.x; + if (blockIdx.x <= total) { + auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; + if (blockIdx.x == start) { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + z[zIndex] = x[xIndex]; + } + } + else { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); + } + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + + template + static void segmentProdFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { + auto stream = context->getCudaStream(); + Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; + NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); + + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + + dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); + fillUpSegments(indices, numClasses, classesRangesBegs, classesRangesLens); + int* 
begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + + if (input->isVector()) { + segmentProdLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + segmentProdTadKernel<<sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + } + + } + // -------------------------------------------------------------------------------------------------------------- // + void segmentProdFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), segmentProdFunctor_, (context, input, indices, output), NUMERIC_TYPES, INTEGER_TYPES); + } + BUILD_DOUBLE_TEMPLATE(template void segmentProdFunctor_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + + // -------------------------------------------------------------------------------------------------------------- // + template + static void unsortedSegmentProdFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); +// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); + NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); +// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); +// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); +// int* classesBuf = reinterpret_cast(classes.specialBuffer()); + fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + + if (input->isVector()) { + unsortedSegmentProdLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + output->assign(1); + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* 
inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + dims.x = input->sizeAt(0); + segmentProdTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + } + + } + // -------------------------------------------------------------------------------------------------------------- // + void unsortedSegmentProdFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentProdFunctor_, (context, input, indices, numOfClasses, output), + FLOAT_TYPES, INTEGER_TYPES); + } + // -------------------------------------------------------------------------------------------------------------- // + BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentProdFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void segmentProdBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput, + Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, + void* outputBuf, Nd4jLong* outputShape) { + __shared__ T* x; + __shared__ T* gradIn; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, gradLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + gradIn = reinterpret_cast(forwardOutput); + gradOut = reinterpret_cast(eps); + gradLen = shape::length(epsShape); + } + + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = gridDim.x * blockDim.x; + + for (auto e = start; e < xLen; e += step) { + + auto zOffset = shape::getIndexOffset(e, outputShape, xLen); + auto xOffset = shape::getIndexOffset(e, inputShape, xLen); + auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto classIndex = y[yOffset]; + auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + + z[zOffset] = gradOut[gradOffsetO] * gradIn[gradOffsetI] / x[xOffset]; + } + } + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void segmentProdBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* forwardOutput, + Nd4jLong* forwardShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, + void* outputBuf, Nd4jLong* outputShape,Nd4jLong* inputTad, + Nd4jLong* inputOffsets, Nd4jLong* gradInTad, Nd4jLong* gradInOffsets, + Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, + Nd4jLong* outOffsets) { + __shared__ T* x; + __shared__ T* gradIn; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + yLen = 
shape::length(indicesShape); + gradOut = reinterpret_cast<T*>(eps); + gradIn = reinterpret_cast<T*>(forwardOutput); + gradLen = shape::length(epsShape); + currentLen = shape::length(outTad); + } + __syncthreads(); + + for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { + auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto segment = y[yIndex]; + T* current = x + inputOffsets[i]; + T* currentOut = z + outOffsets[i]; + T* in = gradIn + gradInOffsets[segment]; + T* outGrad = gradOut + gradOutOffsets[segment]; + + for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { + currentOut[e] = outGrad[e] * in[e] / current[e]; + } + } + + } + + // -------------------------------------------------------------------------------------------------------------- // + template <typename T, typename I> + int segmentProdFunctorBP_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + auto stream = context->getCudaStream(); + NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT<T>(), context);//->shapeInfo(), context); + segmentProdFunctor_<T, I>(context, input, indices, &tempRes); + NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); + if (input->isVector()) { + Nd4jLong loopSize = input->lengthOf(); + auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); + segmentProdBPLinearKernel<T, I><<<gradOut->lengthOf(), loopSize, 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector<int> dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); + Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); + Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); + Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); + + segmentProdBPTadKernel<T, I><<<gradOut->lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), + inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, + outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices, gradOut}); + return Status::OK(); + } + + // -------------------------------------------------------------------------------------------------------------- // + + int segmentProdFunctorBP(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return 
segmentProdFunctorBP_, (context, input, + indices, gradOut, output), FLOAT_TYPES, INTEGER_TYPES); + } + // -------------------------------------------------------------------------------------------------------------- // + + BUILD_DOUBLE_TEMPLATE(template int segmentProdFunctorBP_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + + template + static int unsortedSegmentProdFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); + NDArray tempRes(gradOut->ordering(), gradOut->getShapeAsVector(), DataTypeUtils::fromT(), context);//->shapeInfo(), context); + unsortedSegmentProdFunctor_(context, input, indices, numOfClasses, &tempRes); + NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); + if (input->isVector()) { + Nd4jLong loopSize = input->lengthOf(); + auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); + segmentProdBPLinearKernel<<lengthOf(), loopSize, 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + Nd4jLong* gradInTads = packGradIn.specialShapeInfo(); + Nd4jLong* gradInTadOffsets = packGradIn.specialOffsets(); + Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); + Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); + + segmentProdBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + tempRes.specialBuffer(), tempRes.specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), + inputTads, inputTadOffsets, gradInTads, gradInTadOffsets, gradOutTads, gradOutTadOffsets, + outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices, gradOut}); + return Status::OK(); + } + + // -------------------------------------------------------------------------------------------------------------- // + int unsortedSegmentProdFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentProdFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INTEGER_TYPES); + } + // 
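A note on the backprop math above: the forward pass caches the per-segment products (tempRes), and since out[c] is the product of all x in segment c, d out[c] / d x_i = out[c] / x_i, so the kernels emit gradOut[c] * out[c] / x[i] per element. A minimal host-side sketch of that rule with plain arrays (illustration only, names hypothetical; the formula is undefined where x[i] == 0):

#include <vector>

// grad_x[i] = gradOut[c] * fwd[c] / x[i], with c = idx[i] and fwd the forward segment products
static std::vector<double> segmentProdBPRef(const std::vector<double>& x, const std::vector<int>& idx,
                                            const std::vector<double>& fwd, const std::vector<double>& gradOut) {
    std::vector<double> gradX(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        gradX[i] = gradOut[idx[i]] * fwd[idx[i]] / x[i];
    return gradX;
}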
-------------------------------------------------------------------------------------------------------------- // + BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentProdFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + + // -------------------------------------------------------------------------------------------------------------- // + +} +} +} diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu new file mode 100644 index 000000000..6e3ab24d9 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu @@ -0,0 +1,280 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author GS +// + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { +namespace ops { +namespace helpers { + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void unsortedSegmentSqrtNLinearKernel(void* input, Nd4jLong* inputShape, void* indices, Nd4jLong* indicesShape, int* starts, int* lengths, Nd4jLong numOfClasses, void* output, Nd4jLong* outputShape) { + __shared__ T* val; + __shared__ Nd4jLong xLen, zLen, segment, zIndex; + __shared__ T* x; + __shared__ T* z; + __shared__ I* y; //int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { +// threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; + segment = blockIdx.x;// / threadsPerSegment; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + y = reinterpret_cast(indices); +// extern __shared__ unsigned char shmem[]; +// val = reinterpret_cast(shmem); + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + +// if (segment < numOfClasses) { + zIndex = shape::getIndexOffset(segment, outputShape, zLen); + //start = starts[segment]; + //finish = start + lengths[segment]; + if (lengths[segment] > 0) + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)] / nd4j::math::nd4j_sqrt(lengths[segment]); + else + z[zIndex] = 0; //DataTypeUtils::max(); +// val[segment] = z[zIndex]; +// } + + } + __syncthreads(); + if (lengths[segment] > 0) + for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + if (y[yIndex] == segment && e != starts[segment]) { + nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + // SegmentSqrtN kernel 
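For reference, the sqrt-n family normalizes each segment sum by the square root of the segment's element count, so output[c] = sum(x[i] : idx[i] == c) / sqrt(count(c)), with empty segments producing 0, as in the linear kernel above. A minimal host-side sketch with plain arrays (illustration only, names hypothetical):

#include <cmath>
#include <vector>

static std::vector<double> unsortedSegmentSqrtNRef(const std::vector<double>& x,
                                                   const std::vector<int>& idx, int numOfClasses) {
    std::vector<double> out(numOfClasses, 0.0);
    std::vector<int> count(numOfClasses, 0);
    for (size_t i = 0; i < x.size(); ++i) {   // accumulate sums and counts per segment
        out[idx[i]] += x[i];
        ++count[idx[i]];
    }
    for (int c = 0; c < numOfClasses; ++c)    // scale by sqrt of the segment size
        out[c] = count[c] > 0 ? out[c] / std::sqrt((double) count[c]) : 0.0;
    return out;
}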
+ template + static __global__ void segmentSqrtNTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { + __shared__ T* val; + __shared__ Nd4jLong len, segment, zIndex, total; + __shared__ T* z; + __shared__ int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + segment = indices[blockIdx.x]; // / threadsPerSegment; + z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; + len = shape::length(inputTads); + start = starts[segment]; + finish = start + lengths[segment]; + total = shape::sizeAt(inputShape, 0); + + } + __syncthreads(); + + auto idx = blockIdx.x; + if (blockIdx.x <= total) { + auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; + if (blockIdx.x == start) { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + z[zIndex] = x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment]); + } + } + else { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); + } + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + template + static void unsortedSegmentSqrtNFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); +// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numOfClasses}); + NDArray classesRangesLens = NDArrayFactory::create('c', {numOfClasses}); +// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); +// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + dim3 dims(numOfClasses, indices->lengthOf(), numOfClasses * 32 + 32); +// int* classesBuf = reinterpret_cast(classes.specialBuffer()); + fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + + if (input->isVector()) { + unsortedSegmentSqrtNLinearKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + output->assign(0); + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + dims.x = input->sizeAt(0); + segmentSqrtNTadKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, 
reinterpret_cast(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + } + } + // -------------------------------------------------------------------------------------------------------------- // + void unsortedSegmentSqrtNFunctor(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentSqrtNFunctor_, (context, input, indices, numOfClasses, output), + FLOAT_TYPES, INTEGER_TYPES); + } + BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentSqrtNFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void segmentSqrtNBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, void* indicesBuf, Nd4jLong* indicesShape, + int* lengths, void* outputBuf, Nd4jLong* outputShape) { + __shared__ T* x; + __shared__ T* gradIn; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, gradLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + gradOut = reinterpret_cast(eps); + gradLen = shape::length(epsShape); + } + + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = gridDim.x * blockDim.x; + + for (auto e = start; e < xLen; e += step) { + + auto zOffset = shape::getIndexOffset(e, outputShape, xLen); + auto xOffset = shape::getIndexOffset(e, inputShape, xLen); + auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto classIndex = y[yOffset]; + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + + z[zOffset] = T(gradOut[gradOffsetO] / math::nd4j_sqrt(lengths[classIndex])); + } + } + // -------------------------------------------------------------------------------------------------------------- // + + template + static __global__ void segmentSqrtNBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, + void* indicesBuf, Nd4jLong* indicesShape, int* lengths, void* outputBuf, Nd4jLong* outputShape,Nd4jLong* inputTad, + Nd4jLong* inputOffsets, Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, Nd4jLong* outOffsets) { + __shared__ T* x; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + yLen = shape::length(indicesShape); + gradOut = reinterpret_cast(eps); + gradLen = shape::length(epsShape); + currentLen = shape::length(outTad); + } + __syncthreads(); + + for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { +// auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto segment = y[i]; //yIndex]; + T* currentOut = z + outOffsets[i]; + T* outGrad = gradOut + gradOutOffsets[segment]; + + for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { + auto zIndex = shape::getIndexOffset(e, outTad, currentLen); + auto gradIndex = shape::getIndexOffset(e, gradOutTad, gradLen); + if (lengths[segment] > 0) + currentOut[zIndex] = T(outGrad[gradIndex] / 
math::nd4j_sqrt(lengths[segment])); + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + + template + static int unsortedSegmentSqrtNFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); + NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); + auto numClasses = indices->e(indices->lengthOf() - 1) + 1; + NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); + + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); + fillUpSegments(indices, numClasses, classesRangesBegs, classesRangesLens); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + + if (input->isVector()) { + Nd4jLong loop_size = input->lengthOf(); + auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); + segmentSqrtNBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), + input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), lengths, output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); +// auto packGradIn = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(tempRes.getShapeInfo(), dimensions); + auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); + Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); + + segmentSqrtNBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + gradOut->specialBuffer(), gradOut->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), lengths, + output->specialBuffer(), output->specialShapeInfo(), inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, + outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices, gradOut}); + + return Status::OK(); + } + // -------------------------------------------------------------------------------------------------------------- // + int unsortedSegmentSqrtNFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentSqrtNFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), FLOAT_TYPES, INTEGER_TYPES); + } + // -------------------------------------------------------------------------------------------------------------- // + BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentSqrtNFunctorBP_, (nd4j::LaunchContext* context, NDArray* 
input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), FLOAT_TYPES, INTEGER_TYPES); + +} +} +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu new file mode 100644 index 000000000..4f2cc92a1 --- /dev/null +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu @@ -0,0 +1,393 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author GS +// + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { +namespace ops { +namespace helpers { + // -------------------------------------------------------------------------------------------------------------- // + // Segment ops linear kernels + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void + segmentSumLinearKernel(void *input, Nd4jLong *inputShape, int *starts, int *lengths, Nd4jLong numOfClasses, + void *output, Nd4jLong *outputShape) { + __shared__ + T *val; + __shared__ + Nd4jLong xLen, zLen, segment, zIndex; + __shared__ + T *x; + __shared__ + T *z; + __shared__ int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + threadsPerSegment = (gridDim.x + numOfClasses - 1) / numOfClasses; + segment = blockIdx.x / threadsPerSegment; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + + + if (segment < numOfClasses) { + zIndex = shape::getIndexOffset(segment, outputShape, zLen); + start = starts[segment]; + finish = start + lengths[segment]; + //val[segment] = ; + z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + } + + } + __syncthreads(); + + for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); + } + } + // -------------------------------------------------------------------------------------------------------------- // + + template + static __global__ void + unsortedSegmentSumLinearKernel(void *input, Nd4jLong *inputShape, void *indices, Nd4jLong *indicesShape, + int *starts, int *lengths, Nd4jLong numOfClasses, void *output, + Nd4jLong *outputShape) { + __shared__ + T *val; + __shared__ + Nd4jLong xLen, zLen, segment, zIndex; + __shared__ + T *x; + __shared__ + T *z; + __shared__ + I *y; //int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + segment = blockIdx.x; + x = reinterpret_cast(input); + z = reinterpret_cast(output); + y = reinterpret_cast(indices); + xLen = shape::length(inputShape); + zLen = shape::length(outputShape); + + zIndex = 
shape::getIndexOffset(segment, outputShape, zLen); + if (lengths[segment] > 0) + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + else + z[zIndex] = 0; //DataTypeUtils::max(); + } + __syncthreads(); + + if (lengths[segment] > 0) + for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + if (y[yIndex] == segment && e != starts[segment]) { + nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + // SegmentSum kernel + template + static __global__ void segmentSumTadKernel(void* inputBuf, Nd4jLong* inputShape, Nd4jLong* inputTads, Nd4jLong* inputTadOffsets, I* indices, int* starts, int* lengths, Nd4jLong numOfClasses, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* outputTads, Nd4jLong* outputTadOffsets) { + __shared__ T* val; + __shared__ Nd4jLong len, segment, zIndex, total; + __shared__ T* z; + __shared__ int threadsPerSegment, start, finish; + + if (threadIdx.x == 0) { + segment = indices[blockIdx.x]; // / threadsPerSegment; + z = reinterpret_cast(outputBuf) + outputTadOffsets[segment]; + len = shape::length(inputTads); + start = starts[segment]; + finish = start + lengths[segment]; + total = shape::sizeAt(inputShape, 0); + + } + __syncthreads(); + + auto idx = blockIdx.x; + if (blockIdx.x <= total) { + auto x = reinterpret_cast(inputBuf) + inputTadOffsets[idx]; + if (blockIdx.x == start) { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + z[zIndex] = x[xIndex]; + } + } + else { + for (auto e = threadIdx.x; e < len; e += blockDim.x) { + auto xIndex = shape::getIndexOffset(e, inputTads, len); + auto zIndex = shape::getIndexOffset(e, outputTads, len); + if (lengths[segment]) + nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); + } + } + } + } + // -------------------------------------------------------------------------------------------------------------- // + + template + static void segmentSumFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { + auto stream = context->getCudaStream(); + Nd4jLong numClasses = indices->e(indices->lengthOf() - 1) + 1; + NDArray classesRangesLens = NDArrayFactory::create('c', {numClasses}); + NDArray classesRangesBegs = NDArrayFactory::create('c', {numClasses}); + + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + + dim3 dims(numClasses, indices->lengthOf(), numClasses * 32 + 32); + fillUpSegments(indices, numClasses, classesRangesBegs, classesRangesLens); + int* begins = reinterpret_cast(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast(classesRangesLens.specialBuffer()); + + if (input->isVector()) { + segmentSumLinearKernel<<lengthOf(), numClasses * 32 + 32, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = 
packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + segmentSumTadKernel<T, I><<<input->sizeAt(0), 512, 2048, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast<I*>(indices->specialBuffer()), begins, lengths, numClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + } + + } + // -------------------------------------------------------------------------------------------------------------- // + void segmentSumFunctor(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output) { + BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), segmentSumFunctor_, (context, input, indices, output), NUMERIC_TYPES, INTEGER_TYPES); + } + BUILD_DOUBLE_TEMPLATE(template void segmentSumFunctor_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + template <typename T, typename I> + static void unsortedSegmentSumFunctor_(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); +// NDArray classes = NDArrayFactory::create('c', {numOfClasses, 2}); + NDArray classesRangesBegs = NDArrayFactory::create<int>('c', {numOfClasses}); + NDArray classesRangesLens = NDArrayFactory::create<int>('c', {numOfClasses}); +// NDArray row = NDArrayFactory::create('c', {1, 2}, {(int)indices->lengthOf(), (int)0}); +// classes.applyTrueBroadcast(nd4j::BroadcastOpsTuple::Assign(), &row, &classes); + classesRangesBegs.assign(indices->lengthOf()); + classesRangesLens.assign(0); + dim3 dims(numOfClasses, indices->lengthOf(), (numOfClasses + 1) * 64); +// int* classesBuf = reinterpret_cast(classes.specialBuffer()); + fillUpSegments(indices, numOfClasses, classesRangesBegs, classesRangesLens); + int* begins = reinterpret_cast<int*>(classesRangesBegs.specialBuffer()); + int* lengths = reinterpret_cast<int*>(classesRangesLens.specialBuffer()); + + if (input->isVector()) { + unsortedSegmentSumLinearKernel<T, I><<<dims.x, dims.y, dims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), indices->specialBuffer(), indices->specialShapeInfo(), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo()); + } + else { + output->assign(0); + std::vector<int> dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + dims.x = input->sizeAt(0); + segmentSumTadKernel<T, I><<<dims.x, dims.y, dims.z, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), inputTads, inputTadOffsets, reinterpret_cast<I*>(indices->specialBuffer()), begins, lengths, numOfClasses, output->specialBuffer(), output->specialShapeInfo(), outputTads, outputTadOffsets); + } + + } + // -------------------------------------------------------------------------------------------------------------- // + void unsortedSegmentSumFunctor(nd4j::LaunchContext* context, NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output) { + 
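The BUILD_DOUBLE_SELECTOR that follows dispatches on two runtime dtypes at once, the values' and the indices', to pick one concrete <T, I> instantiation. Conceptually it expands to nested switches; a sketch of the shape of that expansion, showing just two of the generated cases (illustrative only, not the literal macro output):

// Hand-written equivalent of the double dispatch below, for two example type pairs.
switch (input->dataType()) {
    case nd4j::DataType::FLOAT32:
        switch (indices->dataType()) {
            case nd4j::DataType::INT32:
                unsortedSegmentSumFunctor_<float, int>(context, input, indices, numOfClasses, output);
                break;
            case nd4j::DataType::INT64:
                unsortedSegmentSumFunctor_<float, Nd4jLong>(context, input, indices, numOfClasses, output);
                break;
            default: break; // remaining INTEGER_TYPES elided
        }
        break;
    default: break; // remaining NUMERIC_TYPES elided
}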
BUILD_DOUBLE_SELECTOR(input->dataType(), indices->dataType(), unsortedSegmentSumFunctor_, (context, input, indices, numOfClasses, output), + NUMERIC_TYPES, INTEGER_TYPES); + + } + // -------------------------------------------------------------------------------------------------------------- // + BUILD_DOUBLE_TEMPLATE(template void unsortedSegmentSumFunctor_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); + + // -------------------------------------------------------------------------------------------------------------- // + // Backpropagate ops + // -------------------------------------------------------------------------------------------------------------- // + // Sorted sum backpropagate + template + static __global__ void segmentSumBPLinearKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, + void* indicesBuf, Nd4jLong* indicesShape, void* outputBuf, Nd4jLong* outputShape) { + __shared__ T* x; + __shared__ T* gradIn; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, gradLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + gradOut = reinterpret_cast(eps); + gradLen = shape::length(epsShape); + } + + auto start = blockIdx.x * blockDim.x + threadIdx.x; + auto step = gridDim.x * blockDim.x; + + for (auto e = start; e < xLen; e += step) { + + auto zOffset = shape::getIndexOffset(e, outputShape, xLen); + auto xOffset = shape::getIndexOffset(e, inputShape, xLen); + auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto classIndex = y[yOffset]; + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + + z[zOffset] = gradOut[gradOffsetO]; + } + } + // -------------------------------------------------------------------------------------------------------------- // + template + static __global__ void segmentSumBPTadKernel(void* inputBuf, Nd4jLong* inputShape, void* eps, Nd4jLong* epsShape, + void* indicesBuf, Nd4jLong* indicesShape, void* outputBuf, Nd4jLong* outputShape, Nd4jLong* inputTad, + Nd4jLong* inputOffsets, Nd4jLong* gradOutTad, Nd4jLong* gradOutOffsets, Nd4jLong* outTad, Nd4jLong* outOffsets) { + __shared__ T* x; + __shared__ T* gradOut; + __shared__ I* y; + __shared__ T* z; + __shared__ Nd4jLong xLen, yLen, gradLen, currentLen; + + if (threadIdx.x == 0) { + xLen = shape::length(inputShape); + x = reinterpret_cast(inputBuf); + y = reinterpret_cast(indicesBuf); + z = reinterpret_cast(outputBuf); + yLen = shape::length(indicesShape); + gradOut = reinterpret_cast(eps); + gradLen = shape::length(epsShape); + currentLen = shape::length(outTad); + } + + for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { + auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto segment = y[yIndex]; + T* currentOut = z + outOffsets[i]; + T* outGrad = gradOut + gradOutOffsets[segment]; + + for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { + currentOut[e] = outGrad[e]; + } + } + + } + // -------------------------------------------------------------------------------------------------------------- // + template + int segmentSumFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + auto stream = context->getCudaStream(); + NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); + if (input->isVector()) { + 
Nd4jLong loop_size = input->lengthOf(); + auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); + segmentSumBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), + input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + Nd4jLong* inputTads = packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); + Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); + + segmentSumBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), + inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, + outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices, gradOut}); + return Status::OK(); + } + // -------------------------------------------------------------------------------------------------------------- // + + int segmentSumFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return segmentSumFunctorBP_, (context, input, + indices, gradOut, output), NUMERIC_TYPES, INTEGER_TYPES); + } + BUILD_DOUBLE_TEMPLATE(template int segmentSumFunctorBP_, (nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); + // -------------------------------------------------------------------------------------------------------------- // + + template + static int unsortedSegmentSumFunctorBP_(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + auto stream = context->getCudaStream(); + NDArray::prepareSpecialUse({output}, {input, indices, gradOut}); + if (input->isVector()) { + Nd4jLong loop_size = input->lengthOf(); + auto numOfClasses = gradOut->lengthOf(); //indices->e(loop_size - 1); + segmentSumBPLinearKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), + input->specialShapeInfo(), gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo()); + } + else { + std::vector dimensions = ShapeUtils::evalDimsToExclude(input->rankOf(), {0}); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); + auto packGradOut = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(gradOut->getShapeInfo(), dimensions); + Nd4jLong* inputTads 
= packX.specialShapeInfo(); + Nd4jLong* inputTadOffsets = packX.specialOffsets(); + Nd4jLong* outputTads = packZ.specialShapeInfo(); + Nd4jLong* outputTadOffsets = packZ.specialOffsets(); + Nd4jLong* gradOutTads = packGradOut.specialShapeInfo(); + Nd4jLong* gradOutTadOffsets = packGradOut.specialOffsets(); + + segmentSumBPTadKernel<<lengthOf(), input->lengthOf(), 256, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), + gradOut->specialBuffer(), gradOut->specialShapeInfo(), + indices->specialBuffer(), indices->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), + inputTads, inputTadOffsets, gradOutTads, gradOutTadOffsets, + outputTads, outputTadOffsets); + } + NDArray::registerSpecialUse({output}, {input, indices, gradOut}); + return Status::OK(); + } + // -------------------------------------------------------------------------------------------------------------- // + int unsortedSegmentSumFunctorBP(nd4j::LaunchContext* context , NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output) { + BUILD_DOUBLE_SELECTOR(output->dataType(), indices->dataType(), return unsortedSegmentSumFunctorBP_, (context, input, indices, gradOut, numOfClasses, output), NUMERIC_TYPES, INTEGER_TYPES); + } + // -------------------------------------------------------------------------------------------------------------- // + BUILD_DOUBLE_TEMPLATE(template int unsortedSegmentSumFunctorBP_, (nd4j::LaunchContext* context, NDArray* input, NDArray* indices, NDArray* gradOut, Nd4jLong numOfClasses, NDArray* output), NUMERIC_TYPES, INTEGER_TYPES); + +} +} +} \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu index 29cf08544..411d7eac1 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu @@ -24,16 +24,40 @@ namespace nd4j { namespace ops { namespace helpers { - template - static void sequenceMask_(NDArray* input, NDArray* output, int maxIndex) { - // + template + static __global__ void sequenceMaskKernel(void* inputBuf, Nd4jLong* inputShape, void* outputBuf, Nd4jLong* outputShape, int maxIndex) { + + __shared__ I* input; + __shared__ B* output; + __shared__ Nd4jLong inputLen, outputLen; + if (threadIdx.x == 0) { + input = reinterpret_cast(inputBuf); + output = reinterpret_cast(outputBuf); + inputLen = shape::length(inputShape); + outputLen = shape::length(outputShape); + } + + for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x) + for(auto k = threadIdx.x; k < inputLen; k += blockDim.x) + if (i < input[shape::getIndexOffset(k, inputShape, inputLen)]) + output[shape::getIndexOffset(k * maxIndex + i, outputShape, outputLen)] = B(true); + + } + + template + static void sequenceMask_(LaunchContext* context, NDArray* input, NDArray* output, int maxIndex) { + dim3 launchDims(maxIndex, input->lengthOf(), 128); + NDArray::prepareSpecialUse({output}, {input}); + auto stream = context->getCudaStream(); + sequenceMaskKernel<<>>(input->specialBuffer(), input->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), maxIndex); + NDArray::registerSpecialUse({output}, {input}); } void sequenceMask(nd4j::LaunchContext * context, NDArray* input, NDArray* output, int maxIndex) { - BUILD_SINGLE_SELECTOR(input->dataType(), sequenceMask_, (input, output, maxIndex), LIBND4J_TYPES); + BUILD_DOUBLE_SELECTOR(input->dataType(), output->dataType(), sequenceMask_, 
(context, input, output, maxIndex), INTEGER_TYPES, BOOL_TYPES); } - BUILD_SINGLE_TEMPLATE(template void sequenceMask_, (NDArray* input, NDArray* output, int maxIndex), LIBND4J_TYPES); + BUILD_DOUBLE_TEMPLATE(template void sequenceMask_, (nd4j::LaunchContext* context, NDArray* input, NDArray* output, int maxIndex), INTEGER_TYPES, BOOL_TYPES); } } } \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu index 5f57f944f..a7b756bc5 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu @@ -456,6 +456,102 @@ void tileBP(nd4j::LaunchContext * context, const NDArray& gradO /*input*/, NDArr manager.synchronize(); } +/////////////////////////////////////////////////////////////////// +template +__global__ static void scatterUpdateCuda(const int opCode, const int numOfInd, + void* vx, const Nd4jLong *xShapeInfo, const Nd4jLong *xOffsets, + void* vy, const Nd4jLong *yShapeInfo, const Nd4jLong *yOffsets, + const int* indexes) { + + __shared__ T *x, *y; + __shared__ Nd4jLong arrLenX, arrLenY; + + for (int e = 0; e < numOfInd; e++ ) { + + const auto xIndex = indexes[e]; + const bool isOwner = xIndex < gridDim.x ? blockIdx.x == xIndex : blockIdx.x == xIndex % gridDim.x; + + if (!isOwner) + continue; + + if (threadIdx.x == 0) { + x = reinterpret_cast(vx) + xOffsets[xIndex]; + y = reinterpret_cast(vy) + yOffsets[e]; + arrLenX = shape::length(xShapeInfo); + arrLenY = shape::length(yShapeInfo); + } + + __syncthreads(); + + if (arrLenX != arrLenY) + return; + + for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { + + const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); + const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + + switch (opCode) { + case 0: + x[xOffset] += y[yOffset]; + break; + case 1: + x[xOffset] -= y[yOffset]; + break; + case 2: + x[xOffset] *= y[yOffset]; + break; + case 3: + x[xOffset] /= y[yOffset]; + break; + case 4: + x[xOffset] = y[yOffset] - x[xOffset]; + break; + case 5: + x[xOffset] = y[yOffset] / x[xOffset]; + break; + case 6: + x[xOffset] = y[yOffset]; + break; + default: + continue; + } + } + __syncthreads(); + } +} + +template +__host__ static void scatterUpdateCudaLauncher(const cudaStream_t* stream, const int opCode, const int numOfInd, void* vx, const Nd4jLong *xShapeInfo, const Nd4jLong *xOffsets, void* vy, const Nd4jLong *yShapeInfo, const Nd4jLong *yOffsets, const int* indexes) { + + scatterUpdateCuda<<<512, 256, MAX_NUM_THREADS, *stream>>>(opCode, numOfInd, vx, xShapeInfo, xOffsets, vy, yShapeInfo, yOffsets, indexes); +} + + +////////////////////////////////////////////////////////////////////////// +void scatterUpdate(nd4j::LaunchContext* context, NDArray& input, NDArray& updates, const std::vector* intArgs) { + + const int opCode = (*intArgs)[0]; + const int numOfDims = (*intArgs)[1]; + const int numOfInd = (*intArgs)[2 + numOfDims]; + + std::vector tadDimensions(numOfDims); + for (int e = 2; e < 2 + numOfDims; e++) + tadDimensions[e-2] = (*intArgs)[e]; + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), tadDimensions); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(updates.getShapeInfo(), tadDimensions); + + NDArray indices(const_cast(intArgs->data()) + numOfDims + 3, 'c', {numOfInd}, nd4j::DataType::INT32, context); + + PointersManager manager(context, "scatterUpdate"); + + 
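Because the flat intArgs layout parsed above is easy to get wrong, here is a hypothetical call site that matches it; the values are illustrative only. The layout is [opCode, numOfDims, dim_0..dim_{numOfDims-1}, numOfInd, idx_0..idx_{numOfInd-1}], and opCode selects the kernel's switch arm (0 add, 1 subtract, 2 multiply, 3 divide, 4 reversed subtract, 5 reversed divide, 6 assign). Note the kernel silently skips TAD pairs whose lengths differ.

std::vector<int> intArgs = {
    0,      // opCode 0: x[offset] += y[offset]
    1,      // numOfDims: one TAD dimension follows
    1,      // TAD dimension 1: update whole rows
    2,      // numOfInd: two target TADs
    0, 3    // TADs 0 and 3 of `input` get updates from TADs 0 and 1 of `updates`
};
scatterUpdate(context, input, updates, &intArgs);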
NDArray::prepareSpecialUse({&input}, {&input, &updates, &indices}); + BUILD_SINGLE_SELECTOR(input.dataType(), scatterUpdateCudaLauncher, (context->getCudaStream(), opCode, numOfInd, input.specialBuffer(), packX.platformShapeInfo(), packX.platformOffsets(), updates.specialBuffer(), packY.platformShapeInfo(), packY.platformOffsets(), reinterpret_cast(indices.getSpecialBuffer())), LIBND4J_TYPES); + NDArray::registerSpecialUse({&input}, {&input, &updates, &indices}); + + manager.synchronize(); +} @@ -466,17 +562,140 @@ void tileBP(nd4j::LaunchContext * context, const NDArray& gradO /*input*/, NDArr + + + + + + + + + + + + + template + static __global__ void swapShuffleKernel(T* input, Nd4jLong* shape, Nd4jLong firstDim, Nd4jLong len, nd4j::graph::RandomGenerator* rng) { + auto tid = blockIdx.x * blockDim.x; + auto step = blockDim.x * gridDim.x; + + for (int i = firstDim - 1 - tid - threadIdx.x; i > 0; i -= step) { + int r = rng->relativeInt(i) % i; + if (i != r) { + T e0 = input[shape::getIndexOffset(i, shape, len)]; + T e1 = input[shape::getIndexOffset(r, shape, len)]; + //math::nd4j_swap(input(i), input(r)); + input[shape::getIndexOffset(i, shape, len)] = e1; + input[shape::getIndexOffset(r, shape, len)] = e0; + } + } + } + template + static __global__ void fillShuffleKernel(T* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape, Nd4jLong firstDim, Nd4jLong len, int* indices, nd4j::graph::RandomGenerator* rng) { + +// PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + auto tid = blockIdx.x * blockDim.x; + auto step = blockDim.x * gridDim.x; + + for(int i = firstDim - 1 - tid - threadIdx.x; i > 0; i -= step) { + int r = rng->relativeInt(i) % i; + output[shape::getIndexOffset(i, outputShape, len)] = input[shape::getIndexOffset(indices[r], inputShape, len)]; + if(i != r) { + output[shape::getIndexOffset(r, outputShape, len)] = input[shape::getIndexOffset(indices[i], inputShape, len)]; +// output.p(r, input.e(indices[i])); +// math::nd4j_swap(indices[i], indices[r]); + atomicExch(&indices[i], indices[r]); + } + } + + } ////////////////////////////////////////////////////////////////////////// template - void randomShuffle_(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::random::RandomBuffer& rng, const bool isInplace) { + void randomShuffle_(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace) { + + // check edge cases first + int temp; + const int firstDim = input.sizeAt(0); + auto stream = context->getCudaStream(); + NDArray::prepareSpecialUse({&output}, {&input}); + if(input.lengthOf() == 1 || firstDim == 1) { + if(!isInplace) + output.assign(input); + } + else if (input.isVector() || shape::isLikeVector(input.getShapeInfo(), temp)) { + + // apply Fisher-Yates shuffle + nd4j::graph::RandomGenerator* dRandom = nullptr; + cudaMalloc(&dRandom, sizeof(nd4j::graph::RandomGenerator)); + cudaMemcpy(dRandom, &rng, sizeof(nd4j::graph::RandomGenerator), cudaMemcpyHostToDevice); + T* inputBuf = reinterpret_cast(input.specialBuffer()); + if(isInplace) { + swapShuffleKernel<<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), firstDim, input.lengthOf(), dRandom); + } + else { + std::vector indices(firstDim); + std::iota(indices.begin(), indices.end(), 0); + cudaMemcpy(output.specialBuffer(), input.specialBuffer(), sizeof(T), cudaMemcpyDeviceToDevice); + //output.p(Nd4jLong(0), input.e(0)); + PointersManager pointersManager(context, "helper::randomShuffle_"); 
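The two kernels above parallelize a Fisher-Yates pass; for reference, this is the host-side rule they follow (illustration only; the textbook variant draws from an inclusive range, while `relativeInt(i) % i` here draws from [0, i-1], which the sketch mirrors):

#include <random>
#include <utility>
#include <vector>

static void fisherYatesRef(std::vector<int>& a, std::mt19937& gen) {
    // walk from the last element down, swapping each slot with a random earlier one
    for (int i = (int) a.size() - 1; i > 0; --i) {
        std::uniform_int_distribution<int> dist(0, i - 1);  // matches rng->relativeInt(i) % i
        int r = dist(gen);
        if (i != r)
            std::swap(a[i], a[r]);
    }
}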
+ int* indicesDev = reinterpret_cast(pointersManager.replicatePointer(indices.data(), indices.size() * sizeof(int))); + T* outputBuf = reinterpret_cast(output.specialBuffer()); + fillShuffleKernel<<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), outputBuf, output.specialShapeInfo(), firstDim, input.lengthOf(), indicesDev, dRandom); + pointersManager.synchronize(); + } +// rng.rewindH(firstDim - 1); + cudaFree(dRandom); + } + else { + + // evaluate sub-arrays list of input array through all dimensions excluding first one + std::vector dimensions = ShapeUtils::evalDimsToExclude(input.rankOf(), {0}); + auto subArrsListIn = input.allTensorsAlongDimension(dimensions); + + // apply Fisher-Yates shuffle + if(isInplace) { + PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->elementwiseThreshold()) + for(int i = firstDim - 1; i > 0; --i) { + int r = rng.relativeInt(i) % i; + + if(i != r) + subArrsListIn->at(i)->swapUnsafe(*subArrsListIn->at(r)); + } + } + else { + // evaluate sub-arrays list of output array through all dimensions excluding first one + auto subArrsListOut = output.allTensorsAlongDimension(dimensions); + std::vector indices(firstDim); + std::iota(indices.begin(), indices.end(), 0); + bool isZeroShuffled = false; + PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) + for(int i = firstDim - 1; i > 0; --i) { + int r = rng.relativeInt(i) % i; + subArrsListOut->at(i)->assign(subArrsListIn->at(indices[r])); + if(r == 0) + isZeroShuffled = true; + + if(i != r) { + subArrsListOut->at(r)->assign(subArrsListIn->at(indices[i])); + math::nd4j_swap(indices[i], indices[r]); + } + } + if(!isZeroShuffled) + subArrsListOut->at(0)->assign(subArrsListIn->at(0)); + delete subArrsListOut; + } + rng.rewindH(firstDim-1); + delete subArrsListIn; + } + NDArray::registerSpecialUse({&output}, {&input}); } - void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::random::RandomBuffer& rng, const bool isInplace) { + void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace) { BUILD_SINGLE_SELECTOR(input.dataType(), randomShuffle_, (context, input, output, rng, isInplace), LIBND4J_TYPES); } - BUILD_SINGLE_TEMPLATE(template void randomShuffle_, (nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::random::RandomBuffer& rng, const bool isInplace), LIBND4J_TYPES); + BUILD_SINGLE_TEMPLATE(template void randomShuffle_, (nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace), LIBND4J_TYPES); //////////////////////////////////////////////////////////////////////// template @@ -498,11 +717,6 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { output.setIdentity(); } - ////////////////////////////////////////////////////////////////////////// - void scatterUpdate(nd4j::LaunchContext * context, NDArray& operand, NDArray& updates, const std::vector* intArgs) { - - } - ////////////////////////////////////////////////////////////////////////// template static __global__ void global_mergeMaxIndex_(void **inArrs, void **inShapes, const int numArrays, void *voutput, Nd4jLong *outputShape, Nd4jLong length) { diff --git a/libnd4j/include/ops/declarable/helpers/gru.h b/libnd4j/include/ops/declarable/helpers/gru.h index 3a58ea1a0..87e1786fb 100644 --- a/libnd4j/include/ops/declarable/helpers/gru.h +++ b/libnd4j/include/ops/declarable/helpers/gru.h @@ -33,31 
+33,11 @@ namespace helpers { void gruTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* Wx, const NDArray* Wh, const NDArray* b, NDArray* h); - void gruCellBP(nd4j::LaunchContext * context, const NDArray* x, const NDArray* h0, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* dLdh, const NDArray* dLdWx0, - const NDArray* dLdWh0, const NDArray* dLdb0, NDArray* dLdx, NDArray* dLdh0, NDArray* dLdWx, NDArray* dLdWh, NDArray* dLdb); - - -////////////////////////////////////////////////////////////////////////// -FORCEINLINE NDArray sigmoid(const NDArray& arr) { - return (const_cast(arr)).transform(transform::Sigmoid); -} - -FORCEINLINE void sigmoidInplace(const NDArray& arr) { - (const_cast(arr)).applyTransform(transform::Sigmoid); -} - -////////////////////////////////////////////////////////////////////////// -FORCEINLINE NDArray tanh(const NDArray& arr) { - return (const_cast(arr)).transform(transform::Tanh); -} - -FORCEINLINE void tanhInplace(const NDArray& arr) { - (const_cast(arr)).applyTransform(transform::Tanh); -} + void gruCellBP(nd4j::LaunchContext* context, const NDArray* x, const NDArray* hLast, const NDArray* W, const NDArray* Wc, const NDArray* b, const NDArray* bc, const NDArray* dLdr, const NDArray* dLdu, const NDArray* dLdc, const NDArray* dLdh, NDArray* dLdx, NDArray* dLdhLast, NDArray* dLdW, NDArray* dLdWc, NDArray* dLdb, NDArray* dLdbc); } } } -#endif //LIBND4J_GRU_H +#endif //LIBND4J_GRU_H \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/helpers/hashcode.h b/libnd4j/include/ops/declarable/helpers/hashcode.h index d1e3c7325..a7fa5bc83 100644 --- a/libnd4j/include/ops/declarable/helpers/hashcode.h +++ b/libnd4j/include/ops/declarable/helpers/hashcode.h @@ -27,37 +27,37 @@ namespace nd4j { namespace ops { namespace helpers { template - FORCEINLINE Nd4jLong longBytes(T value); + FORCEINLINE _CUDA_HD Nd4jLong longBytes(T value); template <> - FORCEINLINE Nd4jLong longBytes(float value) { + FORCEINLINE _CUDA_HD Nd4jLong longBytes(float value) { int intie = *(int *)&value; return static_cast(intie); } template <> - FORCEINLINE Nd4jLong longBytes(double value) { + FORCEINLINE _CUDA_HD Nd4jLong longBytes(double value) { Nd4jLong longie = *(Nd4jLong *)&value; return longie; } template <> - FORCEINLINE Nd4jLong longBytes(float16 value) { + FORCEINLINE _CUDA_HD Nd4jLong longBytes(float16 value) { return longBytes((float) value); } template <> - FORCEINLINE Nd4jLong longBytes(Nd4jLong value) { + FORCEINLINE _CUDA_HD Nd4jLong longBytes(Nd4jLong value) { return value; } template <> - FORCEINLINE Nd4jLong longBytes(bfloat16 value) { + FORCEINLINE _CUDA_HD Nd4jLong longBytes(bfloat16 value) { return longBytes((float) value); } template - FORCEINLINE Nd4jLong longBytes(T value) { + FORCEINLINE _CUDA_HD Nd4jLong longBytes(T value) { return longBytes((Nd4jLong) value); } diff --git a/libnd4j/include/ops/declarable/helpers/impl/rnn.cpp b/libnd4j/include/ops/declarable/helpers/impl/rnn.cpp index 6e96a7ac6..179c7efab 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/rnn.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/rnn.cpp @@ -30,51 +30,44 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// -static FORCEINLINE NDArray activation(const NDArray& arr) { +void rnnCell(nd4j::LaunchContext * context, const NDArray* xt, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* hPrev, NDArray* ht) { - return 
(const_cast(arr)).transform(transform::Tanh); + // xt input [bS x iS] + // Wx input-to-hidden weights, [iS x nU] + // Wh hidden-to-hidden weights, [nU x nU] + // b biases, [2*nU]: {0, nU} are input-to-hidden biases and {nU, 2*nU} are hidden-to-hidden biases + // hPrev previous cell output [bS x nU], that is at previous time step t-1, in case of projection=false -> nU=nU!!! + + const int nU = hPrev->sizeAt(1); + + // ht is current cell output [bS x nU], that is at current time step t + ht->assign(mmul(*xt, *Wx) + (*b)({{0, nU}}) + mmul(*hPrev, *Wh) + (*b)({{nU, 2*nU}})); // [bS x nU] + [nU] + [bS x nU] + [nU] = [bS x nU] + ht->applyTransform(transform::Tanh); } - -////////////////////////////////////////////////////////////////////////// -void rnnCell(nd4j::LaunchContext * context, const NDArray* xt, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* ht_1, NDArray* ht) { - - // xt input [bS x inSize] - // Wx input-to-hidden weights, [inSize x numUnits] - // Wh hidden-to-hidden weights, [numUnits x numUnits] - // b biases, [2*numUnits]: {0, numUnits} are input-to-hidden biases and {numUnits, 2*numUnits} are hidden-to-hidden biases - // ht_1 previous cell output [bS x numUnits], that is at previous time step t-1, in case of projection=false -> numUnits=numUnits!!! - - const int numUnits = ht_1->sizeAt(1); - - // ht is current cell output [bS x numUnits], that is at current time step t - ht->assign(activation(mmul(*xt, *Wx) + (*b)({{0, numUnits}}) + mmul(*ht_1, *Wh) + (*b)({{numUnits, 2*numUnits}}))); // [bS x numUnits] + [numUnits] + [bS x numUnits] + [numUnits] = [bS x numUnits] -} - - ////////////////////////////////////////////////////////////////////////// void rnnTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* Wx, const NDArray* Wh, const NDArray* b, const NDArray* h0, const NDArray* maxTimeStep, NDArray* h, NDArray* hFinal) { - // x input [time x bS x inSize] - // Wx input-to-hidden weights, [inSize x numUnits] - // Wh hidden-to-hidden weights, [numUnits x numUnits] - // b biases for, [2*numUnits] + // x input [time x bS x iS] + // Wx input-to-hidden weights, [iS x nU] + // Wh hidden-to-hidden weights, [nU x nU] + // b biases for, [2*nU] - // h0 initial cell output (at time step = 0) [bS x numUnits] + // h0 initial cell output (at time step = 0) [bS x nU] // maxTimeStep vector [bS] containing integer values within [0,time), each element of this vector set max time step per each input in batch, this means there are no calculations for time >= maxTimeStep - - const int time = x->sizeAt(0); - const int bS = x->sizeAt(1); - + + const int time = x->sizeAt(0); + const int bS = x->sizeAt(1); + // at first time step if(h0) hFinal->assign(h0); - else - *hFinal = 0.; + else + *hFinal = 0.; BlasHelper::getInstance(); // to avoid memory leak in pragma parallel loops - // loop through batch of inputs - for (int e = 0; e < bS; ++e) { + // loop through batch of inputs + for (int e = 0; e < bS; ++e) { // loop through time steps for (int t = 0; t < time; ++t) { @@ -82,19 +75,19 @@ void rnnTimeLoop(nd4j::LaunchContext * context, const NDArray* x, const NDArray* auto xt = (*x)({t,t+1, e,e+1, 0,0}, true); auto ht = (*h)({t,t+1, e,e+1, 0,0}, true); - auto ht_1 = (*hFinal)({e,e+1, 0,0}, true); // previous state - + auto hPrev = (*hFinal)({e,e+1, 0,0}, true); // previous state + if(t >= maxStep) { ht = 0.; - if(maxStep != 0) - ht_1.assign((*h)({maxStep-1,maxStep, e,e+1, 0,0})); + if(maxStep != 0) + hPrev.assign((*h)({maxStep-1,maxStep, e,e+1, 0,0})); } else { - 
helpers::rnnCell(context, &xt, Wx, Wh, b, &ht_1, &ht);
- ht_1.assign(ht);
+ helpers::rnnCell(context, &xt, Wx, Wh, b, &hPrev, &ht);
+ hPrev.assign(ht);
}
}
- }
+ }
}
diff --git a/libnd4j/include/ops/declarable/helpers/segment_common.h b/libnd4j/include/ops/declarable/helpers/segment_common.h
new file mode 100644
index 000000000..2d0ec5b8b
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/segment_common.h
@@ -0,0 +1,36 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author sgazeos@gmail.com
+// @brief helpers common functions for segment_* ops (segment_max, segment_min, etc.)
+// @brief helpers common functions for unsorted_segment_* ops (unsorted_segment_max, etc.)
+//
+#ifndef __SEGMENT_COMMON_HELPERS__
+#define __SEGMENT_COMMON_HELPERS__
+#include
+#include
+
+namespace nd4j {
+namespace ops {
+namespace helpers {
+ void fillUpSegments(NDArray* indices, Nd4jLong numClasses, NDArray& classesRangesBegs, NDArray& classesRangesLens);
+
+
+}
+}
+}
+#endif
diff --git a/libnd4j/include/ops/declarable/helpers/transforms.h b/libnd4j/include/ops/declarable/helpers/transforms.h
index ceb35ff67..1781c2f21 100644
--- a/libnd4j/include/ops/declarable/helpers/transforms.h
+++ b/libnd4j/include/ops/declarable/helpers/transforms.h
@@ -23,6 +23,7 @@
 #include
 #include
+#include

 namespace nd4j {
 namespace ops {
@@ -32,7 +33,7 @@ namespace helpers {
 void trace(nd4j::LaunchContext * context, const NDArray& input, NDArray& output);
- void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::random::RandomBuffer& rng, const bool isInplace);
+ void randomShuffle(nd4j::LaunchContext * context, NDArray& input, NDArray& output, nd4j::graph::RandomGenerator& rng, const bool isInplace);

 // auxiliary function which serves for recursion purpose and is used in pad operation
 // void recursiveLoopForPad(const int mode, NDArray& input, const NDArray& paddings, NDArray& output, std::vector dimensions, int dim, int inIdx, int outIdx, NDArray& padValue);
diff --git a/libnd4j/include/templatemath.h b/libnd4j/include/templatemath.h
index b95936909..0baf74dbd 100644
--- a/libnd4j/include/templatemath.h
+++ b/libnd4j/include/templatemath.h
@@ -1126,15 +1126,7 @@ inline __device__ bool nd4j_atomicAdd(bool* address, bool val) {
 template <>
 inline __device__ double nd4j_atomicSub(double* address, double val) {
- unsigned long long int* address_as_ull = - (unsigned long long int *) address;
- unsigned long long int old = *address_as_ull, assumed;
- do {
- assumed = old;
- old = atomicCAS(address_as_ull, assumed,__double_as_longlong(val - - __longlong_as_double(assumed)));
- } while (assumed != old);
- return __longlong_as_double(old);
+ return nd4j_atomicAdd(address, -val);
}

 template <>
@@ -1152,15 +1144,7 @@ inline __device__ double nd4j_atomicMul(double* address, 
double val) {

 template <>
 inline __device__ double nd4j_atomicDiv(double* address, double val) {
- unsigned long long int* address_as_ull = - (unsigned long long int*) address;
- unsigned long long int old = *address_as_ull, assumed;
- do {
- assumed = old;
- old = atomicCAS(address_as_ull, assumed,__double_as_longlong(val / - __longlong_as_double(assumed)));
- } while (assumed != old);
- return __longlong_as_double(old);
+ return nd4j_atomicMul(address, 1./val);
}

 template <>
@@ -1179,14 +1163,16 @@ inline __device__ int32_t nd4j_atomicAdd(int32_t* address, int32_t val)

 template <>
 inline __device__ float nd4j_atomicSub(float* address, float val) {
- int* address_as_ull = (int*) address;
- int old = *address_as_ull, assumed;
- do {
- assumed = old;
- old = atomicCAS(address_as_ull, assumed, __float_as_int(val - - __float_as_int(assumed)));
- } while (assumed != old);
- return __int_as_float(old);
+ return nd4j_atomicAdd(address, -val);
+}
+
+template <>
+inline __device__ float16 nd4j_atomicSub(float16* address, float16 val) {
+ return nd4j_atomicAdd(address, -val);
+}
+template <>
+inline __device__ bfloat16 nd4j_atomicSub(bfloat16* address, bfloat16 val) {
+ return nd4j_atomicAdd(address, -val);
}

 template <>
@@ -1415,6 +1401,30 @@ inline __device__ float16 nd4j_atomicMul(float16* address, float16 val)

 template <>
 inline __device__ float nd4j_atomicDiv(float* address, float val) {
+ int* address_as_ull = (int*)address;
+ int old = *address_as_ull, assumed;
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ull, assumed, __float_as_int(__int_as_float(assumed) / val ));
+ } while (assumed != old);
+ return __int_as_float(old);
+}
+
+template <>
+inline __device__ float16 nd4j_atomicDiv(float16* address, float16 val) {
+ int* address_as_ull = (int*)address;
+ int old = *address_as_ull, assumed;
+ do {
+ assumed = old;
+ // reinterpret the stored word as float before dividing, mirroring the float specialization above
+ old = atomicCAS(address_as_ull, assumed, __float_as_int(__int_as_float(assumed) / (float) val));
+ } while (assumed != old);
+ return __int_as_float(old);
+}
+template <>
+inline __device__ bfloat16 nd4j_atomicDiv(bfloat16* address, bfloat16 val) {
+ int* address_as_ull = (int*)address;
+ int old = *address_as_ull, assumed;
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ull, assumed, __float_as_int(__int_as_float(assumed) / (float) val));
+ } while (assumed != old);
+ return __int_as_float(old);
+}
diff --git a/libnd4j/include/types/types.h b/libnd4j/include/types/types.h
index 385861784..b11f44c6e 100644
--- a/libnd4j/include/types/types.h
+++ b/libnd4j/include/types/types.h
@@ -76,6 +76,9 @@
 (nd4j::DataType::FLOAT32, float), \
 (nd4j::DataType::DOUBLE, double)

+#define FLOAT_NATIVE \
+ (nd4j::DataType::FLOAT32, float), \
+ (nd4j::DataType::DOUBLE, double)

 #define FLOAT_TYPES_0 \
 (nd4j::DataType::HALF, float16)
diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp
index 51f2979cb..003c36240 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests10.cpp
@@ -1870,7 +1870,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_1) {
 ASSERT_EQ(ND4J_STATUS_OK, results->status());
 NDArray* result = results->at(0);
- result->printIndexedBuffer("OOOOUUUUTTT");
+// result->printIndexedBuffer("OOOOUUUUTTT");

 ASSERT_TRUE(expected.isSameShapeStrict(result));
 ASSERT_TRUE(expected.equalsTo(result));
@@ -1881,9 +1881,9 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_2) {

- NDArray boxes = 
NDArrayFactory::create('c', {6,4}, {0, 0, 1, 1, 0, 0.1f, 1, 1.1f, 0, -0.1f, 1.f, 0.9f, 0, 10, 1, 11, 0, 10.1f, 1.f, 11.1f, 0, 100, 1, 101}); - NDArray scales = NDArrayFactory::create('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); //3, 0, 1, 2, 4, 5 + NDArray scales = NDArrayFactory::create('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); //3, 0, 1, 2, 4, 5 NDArray expected = NDArrayFactory::create('c', {3}, {3,0,5}); nd4j::ops::non_max_suppression op; @@ -1892,7 +1892,7 @@ TEST_F(DeclarableOpsTests10, Image_NonMaxSuppressing_2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); NDArray* result = results->at(0); - result->printBuffer("NonMaxSuppression OUtput2"); +// result->printBuffer("NonMaxSuppression OUtput2"); ASSERT_TRUE(expected.isSameShapeStrict(result)); ASSERT_TRUE(expected.equalsTo(result)); @@ -1970,6 +1970,7 @@ TEST_F(DeclarableOpsTests10, Image_CropAndResize_3) { delete results; } + //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests10, Image_CropAndResize_4) { diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp index 5fd15f541..53cd4dea0 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests13.cpp @@ -402,22 +402,219 @@ TEST_F(DeclarableOpsTests13, BarnesHutTsne_symmetrized_4) { TEST_F(DeclarableOpsTests13, CellContains_test_1) { -auto corners = NDArrayFactory::create( {0.5384, 0.5640, 0.3449, 0.5257, 0.5505}); -auto width = NDArrayFactory::create({0.4306, 0.3960, 0.4639, 0.5040, 0.4904}); -auto point = NDArrayFactory::create({0.3000, 0.2625, 0.2674, 0.8604, 0.4803}); -//auto exp = NDArrayFactory::create('c', {1, 39}, {15.000000, 0.000000, 0.000000, 65.000000, 60.000000, 145.000000, 20.000000, 25.000000, 65.000000, 145.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000}); -// data.linspace(1); + auto corners = NDArrayFactory::create( {0.5384, 0.5640, 0.3449, 0.5257, 0.5505}); + auto width = NDArrayFactory::create({0.4306, 0.3960, 0.4639, 0.5040, 0.4904}); + auto point = NDArrayFactory::create({0.3000, 0.2625, 0.2674, 0.8604, 0.4803}); + //auto exp = NDArrayFactory::create('c', {1, 39}, {15.000000, 0.000000, 0.000000, 65.000000, 60.000000, 145.000000, 20.000000, 25.000000, 65.000000, 145.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000}); + // data.linspace(1); + + // auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); + // auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); + // auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); + nd4j::ops::cell_contains op; + auto result = op.execute({&corners, &width, &point}, {}, {5}); + ASSERT_EQ(result->status(), Status::OK()); + ASSERT_TRUE(result->at(0)->e(0)); + //result->at(2)->printBuffer("Symmetrized3"); + //exp.printBuffer("EXPect symm3"); + // ASSERT_TRUE(exp[i]->equalsTo(result->at(i))); + //ASSERT_TRUE(exp.equalsTo(result->at(0))); + delete result; +} + 
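+// Illustrative reference for the adjust_hue tests below (a hedged sketch, not the op's
+// CUDA kernel): the T-argument (e.g. {0.5}) is a fraction of the full 360-degree hue
+// circle, the I-argument picks the channel axis, and each pixel's hue is rotated in HSV
+// space while value and chroma stay fixed. The helper name refAdjustHuePixel is
+// hypothetical and only documents the math the expected buffers encode; it assumes
+// <math.h> (fmaxf/fminf/fmodf/fabsf) is available in this translation unit.
+static void refAdjustHuePixel(float r, float g, float b, float delta, float out[3]) {
+    const float v = fmaxf(r, fmaxf(g, b));              // value = max channel
+    const float m = fminf(r, fminf(g, b));
+    const float c = v - m;                              // chroma
+    float h = 0.f;                                      // hue in [0, 6)
+    if (c > 0.f) {
+        if (v == r)      h = fmodf((g - b) / c + 6.f, 6.f);
+        else if (v == g) h = (b - r) / c + 2.f;
+        else             h = (r - g) / c + 4.f;
+    }
+    h = fmodf(h + 6.f * delta + 6.f, 6.f);              // rotate by delta * 360 degrees
+    const float x = c * (1.f - fabsf(fmodf(h, 2.f) - 1.f));
+    float rr = 0.f, gg = 0.f, bb = 0.f;
+    switch ((int) h) {                                  // pick the hue sector
+        case 0: rr = c; gg = x; break;
+        case 1: rr = x; gg = c; break;
+        case 2: gg = c; bb = x; break;
+        case 3: gg = x; bb = c; break;
+        case 4: rr = x; bb = c; break;
+        default: rr = c; bb = x; break;
+    }
+    out[0] = rr + m; out[1] = gg + m; out[2] = bb + m;  // e.g. (0,100,56), delta=0.5 -> (100,0,44)
+}
+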
+//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustHue_1) { + + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {100,0,44, 208,5,220, 177,230,97, 2,255,244}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_hue op; + auto results = op.execute({&input}, {0.5}, {2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + // result->printIndexedBuffer(); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustHue_2) { + + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {4,100,0, 146,220,5, 97,123.8,230, 255,2,164.8}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_hue op; + auto results = op.execute({&input}, {0.9}, {2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustHue_3) { + + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {0.,84.,100., 5.,220.,122.0001, 229.8,97.,230., 255.,142.8002,2.}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_hue op; + auto results = op.execute({&input}, {-0.9}, {2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustHue_4) { + + NDArray input('c', {2,3,2}, {0,17, 100,220, 56,5, 150,255, 97,2, 230,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,3,2}, {100,208, 0,5, 44,220, 177,2, 230,255, 97,244}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_hue op; + auto results = op.execute({&input}, {0.5}, {1}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustHue_5) { + + NDArray input('c', {3,2,2}, {0,17, 150,255, 100,220, 97,2, 56,5, 230,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {3,2,2}, {100,208, 177,2, 0,5, 230,255, 44,220, 97,244}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_hue op; + auto results = op.execute({&input}, {0.5}, {0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustSaturation_1) { + + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {50,100,78, 118.5,220,112.5, 190,163.5,230, 255,128.5,134}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_saturation op; + auto results = op.execute({&input}, {0.5}, {2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + // result->printIndexedBuffer(); + + 
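// adjust_saturation rescales HSV saturation: with v = max channel, factor f gives out = v - f*(v - in) per channel (saturation clamped to [0,1]), e.g. (0,100,56) with f = 0.5 -> (50,100,78)
+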
ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustSaturation_2) { + + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {0.,100.,56., 12.279087,220.,0., 91.654228,0.,230., 255.,0.,11.087015}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_saturation op; + auto results = op.execute({&input}, {10}, {2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + // result->printIndexedBuffer(); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustSaturation_3) { + + NDArray input('c', {2,2,3}, {0,100,56, 17,220,5, 150,97,230, 255,2,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,2,3}, {100.,100.,100., 220.,220.,220., 230.,230.,230., 255., 255., 255.}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_saturation op; + auto results = op.execute({&input}, {-10}, {2}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustSaturation_4) { + + NDArray input('c', {2,3,2}, {0,17, 100,220, 56,5, 150,255, 97,2, 230,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {2,3,2}, {50,118.5, 100,220, 78,112.5, 190,255, 163.5,128.5, 230,134}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_saturation op; + auto results = op.execute({&input}, {0.5}, {1}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + // result->printIndexedBuffer(); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} + +//////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests13, adjustSaturation_5) { + + NDArray input('c', {3,2,2}, {0,17, 150,255, 100,220, 97,2, 56,5, 230,13}, nd4j::DataType::FLOAT32); + NDArray exp ('c', {3,2,2}, {50,118.5, 190,255, 100,220, 163.5,128.5, 78,112.5, 230,134}, nd4j::DataType::FLOAT32); + + nd4j::ops::adjust_saturation op; + auto results = op.execute({&input}, {0.5}, {0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto result = results->at(0); + + ASSERT_TRUE(exp.isSameShape(result)); + ASSERT_TRUE(exp.equalsTo(result)); + + delete results; +} -// auto y = NDArrayFactory::create('c', {2,3}, {-0.1,-2,3, -4, -0.5, -6}); -// auto eps = NDArrayFactory::create('c', {2,3}, {-0.1, 0.2, -0.3, 0.4, -0.5, 0.6}); -// auto exp = NDArrayFactory::create('c', {2,3}, {1, 2, 1, 2, 2, 2}); -nd4j::ops::cell_contains op; -auto result = op.execute({&corners, &width, &point}, {}, {5}); -ASSERT_EQ(result->status(), Status::OK()); -ASSERT_TRUE(result->at(0)->e(0)); -//result->at(2)->printBuffer("Symmetrized3"); -//exp.printBuffer("EXPect symm3"); -// ASSERT_TRUE(exp[i]->equalsTo(result->at(i))); -//ASSERT_TRUE(exp.equalsTo(result->at(0))); -delete result; -} \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp index b4ae3c25f..42b8035b2 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp +++ 
b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp @@ -1479,6 +1479,27 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test3) { delete results; } +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests5, random_shuffle_test04) { + auto input = NDArrayFactory::create('c', {4}); + input.linspace(1); + + nd4j::ops::random_shuffle op; + //NDArray* output; + auto results = op.execute({&input}, {}, {}, {}, true, nd4j::DataType::DOUBLE); + ASSERT_EQ(Status::OK(), results->status()); + auto output = &input; //results->at(0); + bool haveZeros = false; + for(int i = 0; i < output->lengthOf(); ++i) + if(output->e(i) == (float)0.) + haveZeros = true; + + ASSERT_TRUE(input.isSameShape(output)); + //ASSERT_TRUE(!input.equalsTo(output)); + ASSERT_TRUE(!haveZeros); + + delete results; +} ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests5, random_shuffle_test4) { @@ -1486,17 +1507,17 @@ TEST_F(DeclarableOpsTests5, random_shuffle_test4) { input.linspace(1); nd4j::ops::random_shuffle op; - auto results = op.execute({&input}, {}, {}, {}, false, nd4j::DataType::DOUBLE); + //NDArray* output; + auto results = op.execute({&input}, {}, {}, {}, false, nd4j::DataType::DOUBLE); + ASSERT_EQ(Status::OK(), results->status()); auto output = results->at(0); - bool haveZeros = false; for(int i = 0; i < output->lengthOf(); ++i) if(output->e(i) == (float)0.) haveZeros = true; - ASSERT_EQ(Status::OK(), results->status()); ASSERT_TRUE(input.isSameShape(output)); - ASSERT_TRUE(!input.equalsTo(output)); + //ASSERT_TRUE(!input.equalsTo(output)); ASSERT_TRUE(!haveZeros); delete results; diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp index f86870cfd..b377d71d9 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp @@ -1601,8 +1601,8 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_1) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output "); -// exp.printIndexedBuffer("Expected "); + z->printIndexedBuffer("Output "); + exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); @@ -1610,6 +1610,75 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_1) { delete result; } +//////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests6, MatrixInverse_01) { + + auto x = NDArrayFactory::create('c', {1, 5, 5}, { + 2., 4., 60., 8., 10., + 0., 1., 2., 3., 4., + 0., 0., 2., 4., 6., + 0., 0., 0., 1., 2., + 0., 0., 0., 0., 4. + + }); + + auto exp = NDArrayFactory::create('c', {1, 5, 5}, { + 0.5, -2.0, -13.0, 54.0, -6.75, + 0.0, 1.0, -1.0, 1.0, 0.0, + 0, 0, 0.5, -2.0, 0.25, + 0, 0, 0, 1.0, -0.5, + 0, 0, 0, 0, 0.25 + + }); + + nd4j::ops::matrix_inverse op; + auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32); + + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + z->printIndexedBuffer("Output "); + exp.printIndexedBuffer("Expected "); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + +//////////////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests6, MatrixInverse_02) { + + auto x = NDArrayFactory::create('c', {1, 5, 5}, { + 1., 0., 0., 0., 0., + 2., 1., 0., 0., 0., + 30., 2., 1., 0., 0., + 4., 3., 2., 1., 0., + 5., 4., 3., 2., 1. 
+ }); + + auto exp = NDArrayFactory::create('c', {1, 5, 5}, { + 1.0, 0.0, 0.0, 0.0, 0., + -2.0, 1.0, 0., 0., 0., + -26.0, -2.0, 1, 0, 0., + 54.0, 1.0, -2.0, 1, 0., + -27.0, 0.0, 1.0, -2.0, 1. + }); + + nd4j::ops::matrix_inverse op; + auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32); + + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + z->printIndexedBuffer("Output "); + exp.printIndexedBuffer("Expected "); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} //////////////////////////////////////////////////////////////////////////////// /* @@ -1658,6 +1727,39 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_2) { delete result; } */ +TEST_F(DeclarableOpsTests6, MatrixInverse_03) { + + auto x = NDArrayFactory::create('c', {5, 5}, { + 4., 0., 0., 0., 0., + 4., 2., 0., 0., 0., + 30., 2., 1., 0., 0., + 8., 6., 4., 2., 0., + 15., 12., 9., 6., 3., + }); + + auto exp = NDArrayFactory::create('c', {5, 5}, { + 0.25, 0.0, 0.0, 0.0, 0.0, + -0.50, 0.5, 0.0, 0.0, 0.0, + -6.50, -1.0, 1.0, 0.0, 0.0, + 13.50, 0.5, -2.0, 0.5, 0.0, + -6.75, 0.0, 1.0, -1.0, 0.33333333 + }); + + nd4j::ops::matrix_inverse op; + auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32); + + ASSERT_EQ(ND4J_STATUS_OK, result->status()); + + auto z = result->at(0); + z->printIndexedBuffer("Output "); + exp.printIndexedBuffer("Expected "); + + ASSERT_TRUE(exp.isSameShape(z)); + ASSERT_TRUE(exp.equalsTo(z)); + + delete result; +} + //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, MatrixInverse_3) { @@ -1695,7 +1797,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_3) { //////////////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests6, MatrixInverse_4) { - auto x = NDArrayFactory::create('c', {5, 5}, { + auto x = NDArrayFactory::create('c', {5, 5}, { 1., 2., 30., 4., 5., 0., 1., 2., 3., 4., 0., 0., 1., 2., 3., @@ -1703,7 +1805,7 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_4) { 0., 0., 0., 0., 1. 
}); - auto exp = NDArrayFactory::create('c', {5, 5}, { + auto exp = NDArrayFactory::create('c', {5, 5}, { 1.0, -2.0, -26.0, 54.0, -27.0, 0.0, 1.0, -2.0, 1.0, 0.0, 0.0, 0.0, 1.0, -2.0, 1.0, @@ -1712,13 +1814,13 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_4) { }); nd4j::ops::matrix_inverse op; - auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::DOUBLE); + auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32); ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto z = result->at(0); - //z->printIndexedBuffer("Output "); - //exp.printIndexedBuffer("Expected "); + z->printIndexedBuffer("Output "); + exp.printIndexedBuffer("Expected "); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp index b602daf66..d2dc3fb4d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests7.cpp @@ -763,15 +763,15 @@ TEST_F(DeclarableOpsTests7, Test_Dynamic_Partition_119_2) { TEST_F(DeclarableOpsTests7, Test_SequenceMask_1) { - auto input = NDArrayFactory::create('c', {4, 4}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f}); - auto exp = NDArrayFactory::create('c', {4, 4, 16}, {1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, 0.f,1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, 0.f,1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, 0.f, - 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.f,1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f }); + auto input = NDArrayFactory::create('c', {4, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + auto exp = NDArrayFactory::create('c', {4, 4, 16}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1 }); nd4j::ops::sequence_mask op; auto result = op.execute({&input}, {}, {}); @@ -788,19 +788,19 @@ TEST_F(DeclarableOpsTests7, Test_SequenceMask_1) { } TEST_F(DeclarableOpsTests7, Test_SequenceMask_2) { - auto input = NDArrayFactory::create('c', {2, 2, 2}, {10., 20., 30., 4., 0., 6., 7., 8.}); - auto exp = NDArrayFactory::create('c', {2, 2, 2, 30}, { 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., - 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., - 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.}); + auto input = NDArrayFactory::create('c', {2, 2, 2}, {10, 20, 30, 4, 0, 6, 7, 8}); + auto exp = NDArrayFactory::create('c', {2, 2, 2, 30}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); nd4j::ops::sequence_mask op; auto result = op.execute({&input}, {}, {}); ASSERT_EQ(Status::OK(), result->status()); auto z = result->at(0); -// z->printIndexedBuffer("Output"); +// z->printBuffer("Output"); // z->printShapeInfo("Shape"); ASSERT_TRUE(exp.isSameShape(z)); ASSERT_TRUE(exp.equalsTo(z)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index c4c59a9e2..8d97c660d 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -2770,9 +2770,8 @@ TEST_F(DeclarableOpsTests9, batchnorm_bp_test3) { ASSERT_TRUE(isGradCorrect); } -//////////////////////////////////////////////////////////////////// /* -//2019/02/23 AB - GRU backprop tests disabled pending update of GRU backprop op after rewriting forward pass +//////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, gru_cell_bp_test1) { const int bS = 2; @@ -2780,160 +2779,58 @@ TEST_F(DeclarableOpsTests9, gru_cell_bp_test1) { const int nU = 4; NDArray x('c', {bS, iS}, nd4j::DataType::DOUBLE); - NDArray h0('c', {bS, nU}, nd4j::DataType::DOUBLE); - NDArray Wx('c', {iS, 3*nU}, nd4j::DataType::DOUBLE); - NDArray Wh('c', {nU, 3*nU}, nd4j::DataType::DOUBLE); - NDArray b('c', {3*nU}, 
nd4j::DataType::DOUBLE); + NDArray hi('c', {bS, nU}, nd4j::DataType::DOUBLE); + NDArray W('c', {iS+nU, 2*nU}, nd4j::DataType::DOUBLE); + NDArray Wc('c', {iS+nU, nU}, nd4j::DataType::DOUBLE); + NDArray b('c', {2*nU}, nd4j::DataType::DOUBLE); + NDArray bc('c', {nU}, nd4j::DataType::DOUBLE); + NDArray dLdr('c', {bS, nU}, nd4j::DataType::DOUBLE); + NDArray dLdu('c', {bS, nU}, nd4j::DataType::DOUBLE); + NDArray dLdc('c', {bS, nU}, nd4j::DataType::DOUBLE); NDArray dLdh('c', {bS, nU}, nd4j::DataType::DOUBLE); - x.linspace(0.5, 0.5); - h0 = 1.; - Wx = 0.003; - Wh = 0.006; - b = 0.5; + x.linspace(-5, 0.5); + hi = 1.; + W = 0.003; + Wc = 0.006; + b = 0.5; + bc = 0.35; - const OpArgsHolder argsHolderFF({&x, &h0, &Wx, &Wh, &b}, {}, {}); - const OpArgsHolder argsHolderBP({&x, &h0, &Wx, &Wh, &b, &dLdh}, {}, {}); + + const OpArgsHolder argsHolderFF({&x, &hi, &W, &Wc, &b, &bc}, {}, {}); + nd4j::ops::gruCell op; + auto results = op.execute(argsHolderFF); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + auto u = results->at(1); // [bS, nU] + auto c = results->at(2); // [bS, nU] + auto h = results->at(3); // [bS, nU] + + dLdh = 1.; // SUM loss + + NDArray Wch = Wc({iS,iS+nU, 0,0}); // [nU, nU] + NDArray dhdc = 1. - *u; + NDArray dhdu = hi - *c; + NDArray dcdZc = 1. - *c * *c; + dLdc.assign(dLdh * dhdc); + dLdu.assign(dLdh * dhdu); + dLdr.assign(mmul(dLdc * dcdZc * hi, Wch.transpose())); + + delete results; + + + const OpArgsHolder argsHolderBP({&x, &hi, &W, &Wc, &b, &bc, &dLdr, &dLdu, &dLdc, &dLdh}, {}, {}); nd4j::ops::gruCell opFF; nd4j::ops::gruCell_bp opBP; - const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); - - ASSERT_TRUE(isGradCorrect); -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, gru_cell_bp_test2) { - - const int bS = 2; - const int iS = 3; - const int nU = 4; - - NDArray x('c', {bS, iS}, nd4j::DataType::DOUBLE); - NDArray h0('c', {bS, nU}, nd4j::DataType::DOUBLE); - NDArray Wx('c', {iS, 3*nU}, nd4j::DataType::DOUBLE); - NDArray Wh('c', {nU, 3*nU}, nd4j::DataType::DOUBLE); - NDArray b('c', {3*nU}, nd4j::DataType::DOUBLE); - NDArray dLdh('c', {bS, nU}, nd4j::DataType::DOUBLE); - - x.linspace(0.5, 0.5); - h0 = 1.; - Wx = 0.003; - Wh = 0.006; - b = 0.; - - const OpArgsHolder argsHolderFF({&x, &h0, &Wx, &Wh, &b}, {}, {}); - const OpArgsHolder argsHolderBP({&x, &h0, &Wx, &Wh, &b, &dLdh}, {}, {}); - - nd4j::ops::gruCell opFF; - nd4j::ops::gruCell_bp opBP; - - const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); - - ASSERT_TRUE(isGradCorrect); -} - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, gru_cell_bp_test3) { - - const int bS = 2; - const int iS = 3; - const int nU = 4; - - NDArray x('c', {bS, iS}, nd4j::DataType::DOUBLE); - NDArray h0('c', {bS, nU}, nd4j::DataType::DOUBLE); - NDArray Wx('c', {iS, 3*nU}, nd4j::DataType::DOUBLE); - NDArray Wh('c', {nU, 3*nU}, nd4j::DataType::DOUBLE); - NDArray b('c', {3*nU}, nd4j::DataType::DOUBLE); - NDArray dLdh('c', {bS, nU}, nd4j::DataType::DOUBLE); - // NDArray dLdWx0('c', {iS, 3*nU}); - // NDArray dLdWh0('c', {nU, 3*nU}); - // NDArray dLdb0 ('c', {3*nU}); - - x = 1.; - h0 = 0.0; - Wx = 0.0; - Wh = 0.0; - b = 0.5; - - const OpArgsHolder argsHolderFF({&x, &h0, &Wx, &Wh, &b}, {}, {}); - const OpArgsHolder argsHolderBP({&x, &h0, &Wx, &Wh, &b, &dLdh}, {}, {}); - - nd4j::ops::gruCell opFF; - nd4j::ops::gruCell_bp opBP; - - const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, 
argsHolderFF, argsHolderBP); - - ASSERT_TRUE(isGradCorrect); -} - -//////////////////////////////////////////////////////////////////// -// TEST_F(DeclarableOpsTests9, gru_bp_test1) { - -// const int time = 5; -// const int bS = 2; -// const int iS = 3; -// const int nU = 4; - -// NDArray x ('c', {time, bS, iS}); -// NDArray h0 ('c', {bS, nU}); -// NDArray Wx ('c', {iS, 3*nU}); -// NDArray Wh ('c', {nU, 3*nU}); -// NDArray b ('c', {3*nU}); -// NDArray dLdh ('c', {time, bS, nU}); - -// x.linspace(0.5, 0.5); -// h0 = 1.; -// Wx = 0.003; -// Wh = 0.006; -// b = 0.5; - -// const OpArgsHolder argsHolderFF({&x, &h0, &Wx, &Wh, &b}, {}, {}); -// const OpArgsHolder argsHolderBP({&x, &h0, &Wx, &Wh, &b, &dLdh}, {}, {}); - -// nd4j::ops::gru opFF; -// nd4j::ops::gru_bp opBP; - -// const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); - -// ASSERT_TRUE(isGradCorrect); -// } - -//////////////////////////////////////////////////////////////////// -TEST_F(DeclarableOpsTests9, gru_cell_bp_test3_1) { - - const int bS = 2; - const int iS = 3; - const int nU = 4; - - auto x = NDArrayFactory::create('c', {bS, iS}); - auto h0 = NDArrayFactory::create('c', {bS, nU}); - auto Wx = NDArrayFactory::create('c', {iS, 3*nU}); - auto Wh = NDArrayFactory::create('c', {nU, 3*nU}); - auto b = NDArrayFactory::create('c', {3*nU}); - auto dLdh = NDArrayFactory::create('c', {bS, nU}); - // NDArray dLdWx0('c', {iS, 3*nU}); - // NDArray dLdWh0('c', {nU, 3*nU}); - // NDArray dLdb0 ('c', {3*nU}); - - x = 1.; - h0 = 0.0; - Wx = 0.0; - Wh = 0.0; - b = 0.5; - - const OpArgsHolder argsHolderFF({&x, &h0, &Wx, &Wh, &b}, {}, {}); - const OpArgsHolder argsHolderBP({&x, &h0, &Wx, &Wh, &b, &dLdh}, {}, {}); - - nd4j::ops::gruCell opFF; - nd4j::ops::gruCell_bp opBP; - - const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP); + const bool isGradCorrect = GradCheck::checkGrad(opFF, opBP, argsHolderFF, argsHolderBP, {1, 1, 1, 1 , 1, 1}, {0., 1.}, nd4j::GradCheck::LossFunc::SUM, true); ASSERT_TRUE(isGradCorrect); } */ + //////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests9, Cholesky_Test_1) { diff --git a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp index 8f178e976..b6981a5c3 100644 --- a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp @@ -719,6 +719,7 @@ TEST_F(ParityOpsTests, Test_Scatter_Add_1) { } TEST_F(ParityOpsTests, Test_Scatter_Add_2) { + auto vec = NDArrayFactory::create('c', {4}, {1, 2, 3, 4}); NDArray idc('c', {1, 4}, {0, 1, 2, 3}, nd4j::DataType::INT64); auto updates = NDArrayFactory::create('c', {1, 4}, {1, 1, 1, 1}); @@ -1588,36 +1589,79 @@ TEST_F(ParityOpsTests, scatterND_update_test5) { delete result; } +////////////////////////////////////////////////////////////////////// TEST_F(ParityOpsTests, scatter_update_1) { - auto matrix = NDArrayFactory::create_('c', {3, 2}); - auto updates = NDArrayFactory::create_('c', {2, 2}); - updates->assign(1.0); - //updates.printBuffer("Updates"); + NDArray x('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); + NDArray updates('c', {2,2}, {10,20,30,40}, nd4j::DataType::INT32); - auto variableSpace = new VariableSpace(); - variableSpace->putVariable(-1, matrix); - variableSpace->putVariable(-2, updates); - variableSpace->putVariable(1, new Variable(&matrix)); - - auto block = new Context(1, variableSpace, false); - block->fillInputs({-1, -2}); - - std::vector* arguments = 
block->getIArguments(); - arguments->push_back(0); - arguments->push_back(1); - arguments->push_back(1); - arguments->push_back(2); - arguments->push_back(1); - arguments->push_back(2); + NDArray exp('c', {2,2}, {30,40,10,20}, nd4j::DataType::INT32); nd4j::ops::scatter_update op; + auto results = op.execute({&x, &updates}, {}, {6, 1,1, 2,1,0}); + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + // x.printBuffer(); - Nd4jStatus result = op.execute(block); - ASSERT_EQ(ND4J_STATUS_OK, result); + ASSERT_TRUE(exp.isSameShape(x)); + ASSERT_TRUE(exp.equalsTo(x)); - delete block; - delete variableSpace; + delete results; } +////////////////////////////////////////////////////////////////////// +TEST_F(ParityOpsTests, scatter_update_2) { + + NDArray x('c', {2,2}, {1,2,3,4}, nd4j::DataType::INT32); + NDArray updates('c', {2,2}, {10,20,30,40}, nd4j::DataType::INT32); + + NDArray exp('c', {2,2}, {20,10,40,30}, nd4j::DataType::INT32); + + nd4j::ops::scatter_update op; + auto results = op.execute({&x, &updates}, {}, {6, 1,0, 2,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + ASSERT_TRUE(exp.isSameShape(x)); + ASSERT_TRUE(exp.equalsTo(x)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(ParityOpsTests, scatter_update_3) { + + NDArray x('c', {2,2,2}, {1,2,3,4,5,6,7,8}, nd4j::DataType::INT32); + NDArray updates('c', {2,2,2}, {10,20,30,40,50,60,70,80}, nd4j::DataType::INT32); + + NDArray exp('c', {2,2,2}, {50,60,70,80,10,20,30,40}, nd4j::DataType::INT32); + + nd4j::ops::scatter_update op; + auto results = op.execute({&x, &updates}, {}, {6, 2,1,2, 2,1,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + ASSERT_TRUE(exp.isSameShape(x)); + ASSERT_TRUE(exp.equalsTo(x)); + + delete results; +} + +////////////////////////////////////////////////////////////////////// +TEST_F(ParityOpsTests, scatter_update_4) { + + NDArray x('c', {2,2,2}, {1,2,3,4,5,6,7,8}, nd4j::DataType::INT32); + NDArray updates('c', {2,2,2}, {10,20,30,40,50,60,70,80}, nd4j::DataType::INT32); + + NDArray exp('c', {2,2,2}, {20,2,3,10,60,6,7,50}, nd4j::DataType::INT32); + + nd4j::ops::scatter_update op; + auto results = op.execute({&x, &updates}, {}, {6, 1,0, 2,3,0}); + + ASSERT_EQ(ND4J_STATUS_OK, results->status()); + + ASSERT_TRUE(exp.isSameShape(x)); + ASSERT_TRUE(exp.equalsTo(x)); + + delete results; +} diff --git a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp index 859a97bba..e563c8683 100644 --- a/libnd4j/tests_cpu/layers_tests/RNGTests.cpp +++ b/libnd4j/tests_cpu/layers_tests/RNGTests.cpp @@ -278,8 +278,8 @@ TEST_F(RNGTests, Test_Gaussian_22) { auto x0 = NDArrayFactory::create('c', {10000, 1000}); auto x1 = NDArrayFactory::create('c', {10000, 1000}); - RandomLauncher::fillGaussian(_rngA, &x0, 0.0f, 1.0f); - RandomLauncher::fillGaussian(_rngB, &x1, 0.0f, 1.0f); + RandomLauncher::fillGaussian(nd4j::LaunchContext::defaultContext(), _rngA, &x0, 0.0f, 1.0f); + RandomLauncher::fillGaussian(LaunchContext::defaultContext(), _rngB, &x1, 0.0f, 1.0f); //x0.printIndexedBuffer("x0"); //x1.printIndexedBuffer("x1"); @@ -306,7 +306,7 @@ TEST_F(RNGTests, Test_Gaussian_22) { TEST_F(RNGTests, Test_Gaussian_3) { auto x0 = NDArrayFactory::create('c', {10000000}); - RandomLauncher::fillGaussian(_rngA, &x0, 0.0, 1.0); + RandomLauncher::fillGaussian(LaunchContext::defaultContext(), _rngA, &x0, 0.0, 1.0); auto mean = x0.meanNumber().e(0); auto stdev = x0.varianceNumber(nd4j::variance::SummaryStatsStandardDeviation, false).e(0); @@ -319,8 +319,8 @@ 
TEST_F(RNGTests, Test_LogNormal_1) { auto x0 = NDArrayFactory::create('c', {10, 10}); auto x1 = NDArrayFactory::create('c', {10, 10}); - RandomLauncher::fillLogNormal(_rngA, &x0, 1.0f, 2.0f); - RandomLauncher::fillLogNormal(_rngB, &x1, 1.0f, 2.0f); + RandomLauncher::fillLogNormal(LaunchContext::defaultContext(), _rngA, &x0, 1.0f, 2.0f); + RandomLauncher::fillLogNormal(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); ASSERT_TRUE(x0.equalsTo(&x1)); @@ -333,8 +333,8 @@ TEST_F(RNGTests, Test_Truncated_1) { auto x0 = NDArrayFactory::create('c', {10, 10}); auto x1 = NDArrayFactory::create('c', {10, 10}); - RandomLauncher::fillTruncatedNormal(_rngA, &x0, 1.0f, 2.0f); - RandomLauncher::fillTruncatedNormal(_rngB, &x1, 1.0f, 2.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngA, &x0, 1.0f, 2.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); ASSERT_TRUE(x0.equalsTo(&x1)); @@ -357,8 +357,8 @@ TEST_F(RNGTests, Test_Truncated_2) { auto x0 = NDArrayFactory::create('c', {1000, 1000}); auto x1 = NDArrayFactory::create('c', {1000, 1000}); - RandomLauncher::fillTruncatedNormal(_rngA, &x0, 1.0f, 2.0f); - RandomLauncher::fillTruncatedNormal(_rngB, &x1, 1.0f, 2.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngA, &x0, 1.0f, 2.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); ASSERT_TRUE(x0.equalsTo(&x1)); @@ -383,8 +383,8 @@ TEST_F(RNGTests, Test_Truncated_21) { auto x0 = NDArrayFactory::create('c', {1000, 1000}); auto x1 = NDArrayFactory::create('c', {1000, 1000}); - RandomLauncher::fillTruncatedNormal(_rngA, &x0, 1.0f, 2.0f); - RandomLauncher::fillTruncatedNormal(_rngB, &x1, 1.0f, 2.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngA, &x0, 1.0f, 2.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); ASSERT_TRUE(x0.equalsTo(&x1)); @@ -430,8 +430,8 @@ TEST_F(RNGTests, Test_Truncated_22) { auto x0 = NDArrayFactory::create('c', {1000, 1000}); auto x1 = NDArrayFactory::create('c', {1000, 1000}); - RandomLauncher::fillTruncatedNormal(_rngA, &x0, 2.0f, 4.0f); - RandomLauncher::fillTruncatedNormal(_rngB, &x1, 2.0f, 4.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngA, &x0, 2.0f, 4.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngB, &x1, 2.0f, 4.0f); ASSERT_TRUE(x0.equalsTo(&x1)); @@ -477,8 +477,8 @@ TEST_F(RNGTests, Test_Truncated_23) { auto x0 = NDArrayFactory::create('c', {1000, 1000}); auto x1 = NDArrayFactory::create('c', {1000, 1000}); - RandomLauncher::fillTruncatedNormal(_rngA, &x0, 0.0f, 1.0f); - RandomLauncher::fillTruncatedNormal(_rngB, &x1, 0.0f, 1.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngA, &x0, 0.0f, 1.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngB, &x1, 0.0f, 1.0f); ASSERT_TRUE(x0.equalsTo(&x1)); @@ -524,8 +524,8 @@ TEST_F(RNGTests, Test_Truncated_3) { auto x0 = NDArrayFactory::create('c', {10000, 1000}); auto x1 = NDArrayFactory::create('c', {10000, 1000}); - RandomLauncher::fillTruncatedNormal(_rngA, &x0, 1.0f, 2.0f); - RandomLauncher::fillTruncatedNormal(_rngB, &x1, 1.0f, 2.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngA, &x0, 1.0f, 2.0f); + RandomLauncher::fillTruncatedNormal(LaunchContext::defaultContext(), _rngB, &x1, 1.0f, 2.0f); ASSERT_TRUE(x0.equalsTo(&x1)); @@ -964,7 +964,7 @@ 
TEST_F(RNGTests, Test_Reproducibility_2) { TEST_F(RNGTests, Test_Uniform_4) { auto x1 = NDArrayFactory::create('c', {1000000}); - RandomLauncher::fillUniform(_rngB, &x1, 1.0, 2.0); + RandomLauncher::fillUniform(LaunchContext::defaultContext(), _rngB, &x1, 1.0, 2.0); /* Check up distribution */ auto mean = x1.reduceNumber(reduce::Mean); diff --git a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu index 65df94873..71144527a 100644 --- a/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu +++ b/libnd4j/tests_cpu/layers_tests/SortCudaTests.cu @@ -69,6 +69,24 @@ TEST_F(SortCudaTests, test_linear_sort_by_val_1) { ASSERT_EQ(ev, v); } +TEST_F(SortCudaTests, test_linear_sort_by_val_2) { + auto k = NDArrayFactory::create('c', {6}, {0, 1, 2, 3, 4, 5}); +// auto v = NDArrayFactory::create('c', {6}, {1.5, 3.5, 5.5, 9.5, 0.5, 2.5, 4.5, 6.5, 7.5, 8.5}); + NDArray v = NDArrayFactory::create('c', {6}, {0.9f, .75f, .6f, .95f, .5f, .3f}); + auto ek = NDArrayFactory::create('c', {6}, {3, 0, 1, 2, 4, 5}); + auto ev = NDArrayFactory::create('c', {6}, {0.95, 0.9, 0.75, 0.6, 0.5, 0.3}); + + Nd4jPointer extras[2] = {nullptr, LaunchContext::defaultContext()->getCudaStream()}; + + NativeOps nativeOps; + nativeOps.sortByValue(extras, k.buffer(), k.shapeInfo(), k.specialBuffer(), k.specialShapeInfo(), v.buffer(), v.shapeInfo(), v.specialBuffer(), v.specialShapeInfo(), true); + k.tickWriteDevice(); + v.tickWriteDevice(); + k.printIndexedBuffer("KEYS"); + ASSERT_EQ(ek, k); + ASSERT_EQ(ev, v); +} + TEST_F(SortCudaTests, test_tad_sort_by_key_1) { auto k = NDArrayFactory::create('c', {2, 10}, {1, 3, 5, 9, 0, 2, 4, 6, 7, 8, 1, 3, 5, 9, 0, 2, 4, 6, 7, 8}); auto v = NDArrayFactory::create('c', {2, 10}, {1.5, 3.5, 5.5, 9.5, 0.5, 2.5, 4.5, 6.5, 7.5, 8.5, 1.5, 3.5, 5.5, 9.5, 0.5, 2.5, 4.5, 6.5, 7.5, 8.5});