diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp
index 8abee8d82..6c5f6a8c8 100644
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@@ -501,7 +501,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dataType == DataType::UTF16) {
 unicode::utf8to16(string[e], cdata, std::char_traits::length(string[e]));
@@ -568,7 +568,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dataType == DataType::UTF16) {
 unicode::utf8to16(string[e].data(), cdata, string[e].size());
@@ -635,7 +635,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t));
@@ -701,7 +701,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector::length(string[e]) * sizeof(uint16_t));
@@ -767,7 +767,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 unicode::utf32to16(string[e].data(), cdata, string[e].size());
@@ -833,7 +833,7 @@ NDArray::NDArray(const std::vector& shape, const std::vector(bufferAsT() + headerLength);
 auto func = PRAGMA_THREADS_FOR{
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto cdata = data + offsets[e];
 if (dtype == DataType::UTF16) {
 unicode::utf32to16(string[e], cdata, std::char_traits::length(string[e]));
@@ -2367,7 +2367,7 @@ NDArray NDArray::asS() const {
 const auto inData = bufferAsT() + offsetsLength;
 auto func = PRAGMA_THREADS_FOR{
- for (int e = start; e < stop; e += increment) {
+ for (int e = start; e < stop; e++) {
 auto cdata = outData + offsets[e];
 auto end = nInputoffsets[e + 1];
 auto idata = inData + nInputoffsets[e];
@@ -3466,7 +3466,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e(i));
 }
 };
@@ -3479,7 +3479,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e(i));
 }
 };
@@ -3491,7 +3491,7 @@ NDArray NDArray::dup(const char newOrder) const {
 std::vector strings(lengthOf());
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 strings[i] = std::move(this->e(i));
 }
 };
diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp
index 9bdf41a16..58d4b3c34 100644
--- a/libnd4j/blas/cpu/NDArray.cpp
+++ b/libnd4j/blas/cpu/NDArray.cpp
@@ -98,7 +98,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
 auto func = PRAGMA_THREADS_FOR {
 Nd4jLong coords[MAX_RANK];
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 shape::index2coords(i, target.getShapeInfo(), coords);
 const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
@@ -152,7 +152,7 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) {
 auto y = reinterpret_cast(yBuffer);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto temp = x[i];
 x[i] = y[i];
 y[i] = temp;
@@ -266,7 +266,7 @@ NDArray NDArray::tile(const std::vector& reps) const {
 if(result.ordering() == 'c') { // ews == 1 always here
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES);
 }
@@ -277,7 +277,7 @@ NDArray NDArray::tile(const std::vector& reps) const {
 else {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto xOffset = result.getOffset(i);
 auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
 BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES);
@@ -377,7 +377,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vectorordering() == second.ordering() && this->ordering() == third.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func(f[e], s[e], t[e]);
 };
@@ -31,7 +31,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto tOffset = this->getOffset(e);
 auto uOffset = second.getOffset(e);
 auto vOffset = third.getOffset(e);
@@ -44,7 +44,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
 } else {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto tOffset = this->getOffset(e);
 auto uOffset = second.getOffset(e);
 auto vOffset = third.getOffset(e);
@@ -93,7 +93,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functionordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func(f[e], s[e]);
 };
@@ -102,7 +102,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functiongetOffset(e);
 auto yOffset = other.getOffset(e);
@@ -114,7 +114,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::functiongetOffset(e);
 auto yOffset = other.getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -156,7 +156,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) {
 if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func(f[e]);
 };
@@ -165,7 +165,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) {
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 f[xOffset] = func(f[xOffset]);
@@ -176,7 +176,7 @@ void NDArray::applyLambda(const std::function& func, NDArray& target) {
 } else {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -217,7 +217,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr
 if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func(e, f[e]);
 };
@@ -226,7 +226,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr
 if (f == z) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 f[xOffset] = func(e, f[xOffset]);
@@ -237,7 +237,7 @@ void NDArray::applyIndexedLambda(const std::function& func, NDAr
 } else {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto xOffset = this->getOffset(e);
 auto zOffset = target.getOffset(e);
@@ -283,7 +283,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functionordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {
 auto loop = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 z[e] = func((Nd4jLong) e, f[e], s[e]);
 };
@@ -292,7 +292,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functiongetOffset(e);
 auto yOffset = other.getOffset(e);
@@ -304,7 +304,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::functiongetOffset(e);
 auto yOffset = other.getOffset(e);
 auto zOffset = target.getOffset(e);
diff --git a/libnd4j/blas/cpu/NativeOps.cpp b/libnd4j/blas/cpu/NativeOps.cpp
index b945c5bcf..e82f2224e 100644
--- a/libnd4j/blas/cpu/NativeOps.cpp
+++ b/libnd4j/blas/cpu/NativeOps.cpp
@@ -1291,7 +1291,7 @@ void pullRowsGeneric(void *vx,
 _threads = nd4j::math::nd4j_min(_threads, nd4j::Environment::getInstance()->maxThreads());
 auto func = PRAGMA_THREADS_FOR {
- for (auto idx = start; idx < stop; idx += increment) {
+ for (auto idx = start; idx < stop; idx++) {
 auto xTadOffsetForBlock = tadOffsets[indexes[idx]];
 auto zTadOffsetForBlock = zTadOffsets[idx];
@@ -1356,7 +1356,7 @@ void tearGeneric(void *vx,
 auto numTads = shape::length(hXShapeInfo) / tadLength;
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto hZ = reinterpret_cast(targets[i]);
 auto s = hX + tadOffsets[i];
@@ -1478,7 +1478,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS
 auto dZ = reinterpret_cast(dz);
 auto func = PRAGMA_THREADS_FOR {
- for (auto f = start; f < stop; f += increment) {
+ for (auto f = start; f < stop; f++) {
 auto hX = reinterpret_cast(dX[f]);
 //auto hZ = reinterpret_cast(dZ[f]);
diff --git a/libnd4j/include/array/DataTypeConversions.h b/libnd4j/include/array/DataTypeConversions.h
index 3af77ca39..abc804f5e 100644
--- a/libnd4j/include/array/DataTypeConversions.h
+++ b/libnd4j/include/array/DataTypeConversions.h
@@ -52,7 +52,7 @@ namespace nd4j {
 TypeCast::convertGeneric(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e]));
 };
@@ -110,7 +110,7 @@ namespace nd4j {
 TypeCast::convertGeneric(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e]));
 };
@@ -138,7 +138,7 @@ namespace nd4j {
 #else
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e]));
 };
@@ -164,7 +164,7 @@ namespace nd4j {
 TypeCast::convertGeneric(nullptr, tmp, length, buffer);
 #else
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment)
+ for (auto e = start; e < stop; e++)
 buffer[e] = canKeep ? static_cast(tmp[e]) : BitwiseUtils::swap_bytes(static_cast(tmp[e]));
 };
diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp
index 1aaaaebc7..b661d02e7 100644
--- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp
+++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp
@@ -49,7 +49,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::EWS1: {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -70,7 +70,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::EWSNONZERO: {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -91,7 +91,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 case nd4j::LoopKind::RANK1: {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -114,7 +114,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(2, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -141,7 +141,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(3, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -170,7 +170,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(4, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -201,7 +201,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 shape::updateStrides(5, tadShape, newStride, 'c');
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -234,7 +234,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -258,7 +258,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
@@ -284,7 +284,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
 const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto tad = const_cast(x) + tadOffsets[i];
 auto indexValue = OpType::startingIndexValue(tad);
diff --git a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
index 95fe19109..f047d1136 100644
--- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
+++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
@@ -80,7 +80,7 @@ namespace nd4j {
 int nLen = zArr.lengthOf() / yArr.sizeAt(-1);
 auto func = PRAGMA_THREADS_FOR{
- for (uint32_t total = start; total < stop; total += increment) {
+ for (uint32_t total = start; total < stop; total++) {
 uint32_t i = total / zDim1;
 uint32_t j = total % zDim1;
diff --git a/libnd4j/include/loops/cpu/indexreduce.hpp b/libnd4j/include/loops/cpu/indexreduce.hpp
index 829f60a18..8d3af7eb4 100644
--- a/libnd4j/include/loops/cpu/indexreduce.hpp
+++ b/libnd4j/include/loops/cpu/indexreduce.hpp
@@ -73,7 +73,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
 auto func = PRAGMA_THREADS_FOR {
 intermediatery[thread_id] = OpType::startingIndexValue(x);
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 IndexValue curr(x[i], i);
 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
 }
@@ -88,7 +88,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
 auto func = PRAGMA_THREADS_FOR {
 intermediatery[thread_id] = OpType::startingIndexValue(x);
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 IndexValue curr(x[offset], i);
 intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
diff --git a/libnd4j/include/loops/cpu/random.hpp b/libnd4j/include/loops/cpu/random.hpp
index 35674de36..ab9793694 100644
--- a/libnd4j/include/loops/cpu/random.hpp
+++ b/libnd4j/include/loops/cpu/random.hpp
@@ -75,7 +75,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
 }
@@ -93,7 +93,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
@@ -111,7 +111,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
@@ -129,7 +129,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
@@ -149,7 +149,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
@@ -197,7 +197,7 @@ namespace functions {
 else{
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
 }
@@ -213,7 +213,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
@@ -255,7 +255,7 @@ namespace functions {
 auto func = PRAGMA_THREADS_FOR {
 PRAGMA_OMP_SIMD
- for (uint64_t i = start; i < stop; i += increment) {
+ for (uint64_t i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
 z[offset] = OpClass::op(i, length, rng, extraArguments);
 }
diff --git a/libnd4j/include/loops/cpu/reduce3.hpp b/libnd4j/include/loops/cpu/reduce3.hpp
index 8d50aedbc..c24a3d474 100644
--- a/libnd4j/include/loops/cpu/reduce3.hpp
+++ b/libnd4j/include/loops/cpu/reduce3.hpp
@@ -88,7 +88,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo,
 if (kindOfLoop == nd4j::LoopKind::EWS1) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
 }
 };
@@ -98,7 +98,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo,
 } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
 }
@@ -110,7 +110,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo,
 const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
 auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
 intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp
index a8f766f6a..2e36b8085 100644
--- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp
+++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp
@@ -158,7 +158,7 @@ namespace functions {
 const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo(tadShapeShapeInfo, tadShapeShapeInfoCast);
 auto func = PRAGMA_THREADS_FOR {
- for (auto r = start; r < stop; r += increment) {
+ for (auto r = start; r < stop; r++) {
 auto tadOffsetForBlock = tadPack.primaryOffsets()[r];
 auto tx = x + tadOffsetForBlock;
diff --git a/libnd4j/include/loops/impl/type_conversions.cpp b/libnd4j/include/loops/impl/type_conversions.cpp
index b12ff5796..36c95e731 100644
--- a/libnd4j/include/loops/impl/type_conversions.cpp
+++ b/libnd4j/include/loops/impl/type_conversions.cpp
@@ -81,7 +81,7 @@ namespace nd4j {
 // now we actually apply quantization
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 rz[e] = static_cast(nd4j::math::nd4j_round( 1.0f * static_cast(x[e]) / nd4j::math::nd4j_max(amax, amin) * max_byte));
 }
 };
@@ -177,7 +177,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
 int flimit = limit + 4;
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 int el = x[e];
 int ael = nd4j::math::nd4j_abs(el) - 1;
 z[ael] += el > 0 ? static_cast(threshold) : static_cast(-threshold);
@@ -202,7 +202,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
 auto z = reinterpret_cast(dz);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 z[i] = static_cast(static_cast(x[i]));
 }
 };
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
index f8704d7b0..baf19de10 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
@@ -153,7 +153,7 @@ namespace helpers {
 auto rowSize = sizeof(T) * colCount;
 auto func = PRAGMA_THREADS_FOR {
- for (auto n = start; n < stop; n += increment) {
+ for (auto n = start; n < stop; n++) {
 int s = rowP->e(n);
 int end = rowP->e(n + 1);
 int shift = n * colCount;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
index 56c93b611..2e63c9d5e 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
@@ -291,7 +291,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra
 shape::calcOffsets(tadShapeInfo, offsets);
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 auto inBuff = input.bufferAsT() + tadOffsets[i];
 auto outBuff = output.bufferAsT() + tadOffsets[i];
@@ -341,7 +341,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a
 const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo();
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 // FIXME: double!
 double x = input.e(i);
 if (x < 0.0) {
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
index 978c037fa..5a22b02eb 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
@@ -67,7 +67,7 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T *xTad = x + packX.platformOffsets()[i];
 T *zTad = z + packZ.platformOffsets()[i];
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
index d4b0de398..594280ebe 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
@@ -66,7 +66,7 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T *xTad = x + packX.platformOffsets()[i];
 T *zTad = z + packZ.platformOffsets()[i];
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp
index b408da720..c63dc3c1c 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp
@@ -94,7 +94,7 @@ void bgemm_(const std::vector& vA, const std::vector& vB, st
 int vaSize = vA.size();
 auto func = PRAGMA_THREADS_FOR {
- for (auto p = start; p < stop; p += increment) {
+ for (auto p = start; p < stop; p++) {
 auto A = reinterpret_cast(vA.at(p)->buffer());
 auto B = reinterpret_cast(vB.at(p)->buffer());
 auto C = reinterpret_cast(vC.at(p)->buffer());
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
index ad2e29a97..aa9624600 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
@@ -141,7 +141,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
 Nd4jLong coords[MAX_RANK];
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 shape::index2coords(i, input->getShapeInfo(), coords);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp
index 83cc966ba..5e80d12fb 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp
@@ -117,7 +117,7 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con
 int xLen = x.lengthOf();
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment)
+ for (auto i = start; i < stop; i++)
 output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i));
 };
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
index 5aad38da8..26f82bdd9 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
@@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp
 auto func = PRAGMA_THREADS_FOR {
 T *col, *im;
- for (uint b = start; b < stop; b += increment) {
+ for (uint b = start; b < stop; b++) {
 T *im0 = imBuff + b * imStride0;
 T *col4 = colBuff + b * colStride0;
 for (int colH = 0; colH < oH; ++colH, col4 += colStride4) {
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp
new file mode 100644
index 000000000..1bdf0a6ad
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp
@@ -0,0 +1,41 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
+//
+
+
+#include
+#include
+
+namespace nd4j {
+ namespace ops {
+ namespace helpers {
+ //////////////////////////////////////////////////////////////////////////
+ template
+ static void concat_(const std::vector& inArrs, NDArray& output, const int axis) {
+ nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis);
+ }
+
+ void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) {
+ BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES);
+ }
+
+ BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector& inArrs, NDArray& output, const int axis), LIBND4J_TYPES);
+ }
+ }
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp
index 4f8989caf..39449c7f8 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp
@@ -32,7 +32,7 @@ namespace helpers {
 int lLen = labels->lengthOf();
 auto func = PRAGMA_THREADS_FOR {
- for (int j = start; j < stop; j += increment) {
+ for (int j = start; j < stop; j++) {
 auto label = labels->e(j);
 auto pred = predictions->e(j);
 T value = (weights == nullptr ? (T) 1.0f : weights->e(j));
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp
index ca30d73bd..1f55378c0 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp
@@ -50,7 +50,7 @@ namespace nd4j {
 T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0);
 auto func = PRAGMA_THREADS_FOR {
- for (auto y = start; y < stop; y += increment) {
+ for (auto y = start; y < stop; y++) {
 const float inY = (cropHeight > 1) ? y1 * (imageHeight - 1) + y * heightScale : 0.5 * (y1 + y2) * (imageHeight - 1);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp
index c12b1ce4f..6a8523925 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp
@@ -39,7 +39,7 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray
 int tads = tadsA.size();
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto a_ = tadsA.at(e);
 auto b_ = tadsB.at(e);
 auto o_ = tadsO.at(e);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp
index f041452ab..d3e524ff4 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp
@@ -46,7 +46,7 @@ namespace helpers {
 if (isNHWC) {
 const int total_count = batch_size * output_height * output_width * output_depth;
 auto func = PRAGMA_THREADS_FOR {
- for (auto out_idx = start; out_idx < stop; out_idx += increment) {
+ for (auto out_idx = start; out_idx < stop; out_idx++) {
 const int d = out_idx % output_depth;
 const int out_idx2 = out_idx / output_depth;
 const int w = out_idx2 % output_width;
@@ -70,7 +70,7 @@ namespace helpers {
 const int total_count = batch_size * input_depth_by_input_area;
 auto func = PRAGMA_THREADS_FOR {
- for (int input_idx = start; input_idx < stop; input_idx += increment) {
+ for (int input_idx = start; input_idx < stop; input_idx++) {
 const int n_bY_bX_oC_iY = input_idx / input_width;
 const int iX = input_idx - n_bY_bX_oC_iY * input_width;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp
index 8035f8216..2a51b92a6 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp
@@ -32,7 +32,7 @@ template
 static void diGamma_(const NDArray& x, NDArray& z) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment)
+ for (auto i = start; i < stop; i++)
 z.p(i, diGammaScalar(x.e(i)));
 };
 samediff::Threads::parallel_for(func, 0, x.lengthOf());
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
index 9db974b36..a470f140a 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
@@ -35,7 +35,7 @@ namespace helpers {
 int inLen = input->lengthOf();
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 float val = nodeRng.relativeT(e, T(0.f), T(1.f));
 if (val < probValue)
@@ -130,7 +130,7 @@ namespace helpers {
 nd4j::graph::RandomGenerator nodeRng(3019L, seed);
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 float randVal = nodeRng.relativeT(e, T(0.f), T(1.f));
 float xVal = input->e(e);
 output->p(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
index 281e6c809..0673a6f2b 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
@@ -62,7 +62,7 @@ namespace nd4j {
 unsigned int outSize = outputList.size();
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 outputs[i].first = outputList[i];
 outputs[i].second = 0;
 for (int e = 0; e < indices->lengthOf(); ++e)
@@ -168,7 +168,7 @@ namespace nd4j {
 unsigned int gradsSize = inputGradientList.size();
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 outputs[i].first = inputGradientList[i];
 outputs[i].second = 0;
 for (int e = 0; e < indices->lengthOf(); ++e)
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp
index 0a46c995e..b2707ea5c 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp
@@ -50,7 +50,7 @@ namespace helpers {
 colCast = 0;
 auto func = PRAGMA_THREADS_FOR {
- for (auto batch = 0; batch < stop; batch += increment) {
+ for (auto batch = 0; batch < stop; batch++) {
 auto patch = listOfMatricies.at(batch);
 auto outMatrix = listOfOutputs.at(batch);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
index 09c8c09ea..ed844e84f 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
@@ -59,7 +59,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 if(input->rankOf() == 1 && output->rankOf() == 1) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment)
+ for (auto i = start; i < stop; i++)
 output->p(i, input->e(indices->e(i)));
 };
@@ -88,7 +88,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -100,7 +100,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 } else {
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e(i)]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -140,7 +140,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@@ -155,7 +155,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 void* inBuff = input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
 void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp
index 9e3bdf885..fc6fc768b 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp
@@ -56,7 +56,7 @@ namespace nd4j {
 if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto _x = static_cast(xBuffer[e]);
 auto _y = static_cast(yBuffer[e]);
@@ -67,7 +67,7 @@ namespace nd4j {
 maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
 } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) {
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto _x = static_cast(xBuffer[e * xEws]);
 auto _y = static_cast(yBuffer[e * yEws]);
@@ -78,7 +78,7 @@ namespace nd4j {
 maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
 } else {
 auto func = PRAGMA_THREADS_FOR {
- for (auto e = start; e < stop; e += increment) {
+ for (auto e = start; e < stop; e++) {
 auto _x = static_cast(x.e(e));
 auto _y = static_cast(y.e(e));
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
index 04df86c36..beb48e382 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
@@ -42,7 +42,7 @@ namespace nd4j {
 // we divide array into 32 element chunks, and store intermediate results once
 auto func = PRAGMA_THREADS_FOR {
- for (auto b = 0; b < stop; b += increment) {
+ for (auto b = 0; b < stop; b++) {
 auto blockBuffer = buffer + b * numBlocks;
 Nd4jLong r = 1;
@@ -64,7 +64,7 @@ namespace nd4j {
 auto func2 = PRAGMA_THREADS_FOR {
- for (auto b = start; b < stop; b += increment) {
+ for (auto b = start; b < stop; b++) {
 auto blockBuffer = tempBuffer + b * numBlocks;
 Nd4jLong r = 1;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
index 9d30ddcf7..23acab375 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
@@ -280,7 +280,7 @@ namespace helpers {
 int xsSize = xs.size();
 // Scale x interpolation weights to avoid a multiplication during iteration.
 auto func = PRAGMA_THREADS_FOR {
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 xs[i]._bottomIndex *= channels;
 xs[i]._topIndex *= channels;
 }
@@ -906,7 +906,7 @@ namespace helpers {
 auto outputPtr = output->bufferAsT();
 // output is always float. TO DO: provide another float types also with template declaration
 auto batchProcess = PRAGMA_THREADS_FOR {
- for (auto batch = start; batch < stop; batch += increment) {
+ for (auto batch = start; batch < stop; batch++) {
 for (auto y = 0; y < st.outHeight; ++y) {
 const float inY = y * st.heightScale;
 const float inY1 = (y + 1) * st.heightScale;
@@ -961,7 +961,7 @@ namespace helpers {
 if (Status::OK() == res) {
 std::vector xCached(st.outWidth);
 auto cachingProcedure = PRAGMA_THREADS_FOR {
- for (auto x = start; x < stop; x += increment) {
+ for (auto x = start; x < stop; x++) {
 auto &xCache = xCached[x];
 const float inX = x * st.widthScale;
 const float inX1 = (x + 1) * st.widthScale;
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
index e065174d5..b98e7f026 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
@@ -39,7 +39,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
 'c' == output.ordering() && 1 == output.ews()){
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const auto xStep = i*3;
 z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2];
 }
@@ -52,7 +52,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
 auto func = PRAGMA_THREADS_FOR{
 Nd4jLong coords[MAX_RANK];
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 shape::index2coords(i, output.getShapeInfo(), coords);
 const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
 const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
@@ -99,7 +99,7 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
 const Nd4jLong zDimCstride = output.stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T* xTad = x + packX.platformOffsets()[i];
 T* zTad = z + packZ.platformOffsets()[i];
 op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@@ -157,7 +157,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T* xTad = x + packX.platformOffsets()[i];
 T* zTad = z + packZ.platformOffsets()[i];
 op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@@ -207,7 +207,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
 const Nd4jLong zDimCstride = output->stridesOf()[dimC];
 auto func = PRAGMA_THREADS_FOR{
- for (auto i = start; i < stop; i += increment) {
+ for (auto i = start; i < stop; i++) {
 const T* xTad = x + packX.platformOffsets()[i];
 T* zTad = z + packZ.platformOffsets()[i];
 //simple M*v //tr.T*v
diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
index 4bc9d3304..1fea8e4fe 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
@@ -146,7 +146,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector
 int span = (tads / num_threads) + 8;
auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { auto rX = const_cast(input)->bufferAsT() + tadOffsets[r]; auto rZ = output->bufferAsT() + zOfsets[r]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index c9b833cf5..aeb9e38b0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const T *x = inBuff + inTadOffsets[i]; T *y = outBuff + outTadOffsets[i]; @@ -179,7 +179,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; @@ -247,7 +247,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c else { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i += increment) { + for (uint i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 683a82392..634d875d2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto h_ = h->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (uint e = start; e < stop; e += increment) { + for (uint e = start; e < stop; e++) { c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); h_[e] = nd4j::math::nd4j_tanh(c_[e]); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp index 2856e73b9..7d2eb5051 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp @@ -45,7 +45,7 @@ namespace helpers { auto n = shape::sizeAt(matrixShape, -1); auto loop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong theFirstPos[] = {theFirst, i}; Nd4jLong theSecondPos[] = {theSecond, i}; auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0); @@ -203,7 +203,7 @@ namespace helpers { auto result = -1; //auto loop = PRAGMA_THREADS_FOR { auto start = column, stop = rowNum, increment = 1; - for (auto rowCounter = start; rowCounter < stop; rowCounter += increment) { + for (auto rowCounter = start; rowCounter < stop; rowCounter++) { Nd4jLong xPos[] = {rowCounter, column}; auto xIndex = shape::getOffset(compoundShape, xPos, 0); if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) { @@ -221,7 +221,7 @@ namespace helpers { Nd4jLong xDiag[] = {currentRow, currentRow}; auto diagIndex = shape::getOffset(compoundShape, xDiag, 0); auto loop = PRAGMA_THREADS_FOR { - for (int j = start; j < stop; j += increment) { + for (auto j = start; j < stop; j++) { Nd4jLong xRow[] = {j, currentRow}; auto rowIndex = shape::getOffset(compoundShape, xRow, 0); compoundBuf[rowIndex] /= 
compoundBuf[diagIndex]; //output->t(i, i); @@ -310,7 +310,7 @@ namespace helpers { permutations = permutationVectors->allTensorsAlongDimension({-1}); auto loop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { luNN_(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n); } }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp index cc43c1866..8a2048263 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp @@ -46,7 +46,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) { int lO = listOut.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) for (int j = 0; j < lastDimension; ++j) listOut.at(i)->p(j, listDiag.at(i)->e(j, j)); }; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp index dcca5075e..20d8bd34f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp @@ -55,7 +55,7 @@ namespace helpers { Nd4jLong oL = output->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto row = rows.at(e); output->p(e, row->e(n)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index 3e18d6d14..71beed7f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -49,7 +49,7 @@ namespace nd4j { if (tadEws >= 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = 0; e < stop; e += increment) { + for (auto e = 0; e < stop; e++) { auto cO = output + tadPack.primaryOffsets()[e]; auto idx = static_cast(indices[e]); @@ -70,7 +70,7 @@ namespace nd4j { samediff::Threads::parallel_tad(func, 0, numTads); } else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto cO = output + tadPack.primaryOffsets()[e]; auto idx = static_cast(indices[e]); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp index fc572677e..df80636ee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp @@ -70,7 +70,7 @@ template static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T order = n.e(i); if(order != static_cast(order)) // if order has fractional part then do not perform calculations and return NAN output.p(i, std::numeric_limits::quiet_NaN()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp index 90b69ca6f..9e1980e54 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp @@ -113,7 +113,7 @@ namespace helpers { ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); ResultSet 
listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim})); auto batching = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { //qr here qrSingle(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp index bb0e7e24e..a14fb89f9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp @@ -39,7 +39,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto auto d = delta.e(0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) buff[i] = s + i * d; }; samediff::Threads::parallel_for(func, 0, len); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 9ee906bd5..4c80e3bf2 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -54,7 +54,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inArr == outArr) { if (inEWS == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto idx = sLength - e; swap(inArr, e, idx); } @@ -63,7 +63,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * } else if (inEWS > 1) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto idx1 = (sLength - e) * inEWS; Nd4jLong idx2 = e * inEWS; swap(inArr, idx1, idx2); @@ -75,7 +75,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); swap(outArr, inOffset, outOffset); @@ -93,14 +93,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { auto func = PRAGMA_THREADS_FOR { - for (Nd4jLong e = start; e < stop; e += increment) + for (Nd4jLong e = start; e < stop; e++) outArr[sLength - e] = inArr[e]; }; samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[e] = inArr[e]; }; samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); @@ -109,14 +109,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[(sLength - e) * outEWS] = inArr[e * inEWS]; }; samediff::Threads::parallel_for(func, 0, numOfElemsToReverse); if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) outArr[e * outEWS] = inArr[e * inEWS]; }; samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength); @@ -125,7 +125,7 @@ static 
void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * else { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; @@ -136,7 +136,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if(inLength != numOfElemsToReverse) { auto f2 = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto inOffset = shape::getIndexOffset(e, inShapeBuffer); auto outOffset = shape::getIndexOffset(e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 01e346136..09a628b84 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -114,7 +114,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& // loop through input array auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); @@ -300,7 +300,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra // loop through output array auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp index fd285ed9c..557d63fd3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp @@ -48,7 +48,7 @@ namespace helpers { const int total_count = batch_size * input_height * input_width * input_depth; auto func = PRAGMA_THREADS_FOR { - for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + for (auto inp_idx = start; inp_idx < stop; inp_idx++) { // inp_idx = d + input_depth * (w + input_width * (h + input_height * b)) const int d = inp_idx % input_depth; const int inp_idx2 = inp_idx / input_depth; @@ -74,7 +74,7 @@ namespace helpers { const int total_count = batch_size * output_depth_by_output_area; auto func = PRAGMA_THREADS_FOR { - for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) { + for (auto inp_idx = start; inp_idx < stop; inp_idx++) { const int n_iC_oY_bY_oX = inp_idx / block_size; const int bX = inp_idx - n_iC_oY_bY_oX * block_size; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp index a3f0c01be..2de2b2d22 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp @@ -45,7 +45,7 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int Nd4jLong xCoords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, xShapeInfo, xCoords); @@ -79,7 +79,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind if(outRank == 1) { auto func = 
PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong idx = indices.e(i); NDArray out = output({idx, idx + 1}); @@ -99,7 +99,7 @@ void scatter(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& ind std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray outSubArr = output(indices.e(i), std::vector({0})); NDArray updSubArr = updates(i, dimsToExcludeUpd); @@ -121,7 +121,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i if(outRank == 1) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong idx = indices.e(i); NDArray out = output({idx, idx + 1}); @@ -139,7 +139,7 @@ void scatterND(nd4j::LaunchContext *context, pairwise::Ops op, const NDArray& i auto func = PRAGMA_THREADS_FOR { std::vector idxRangeOut(2*outRank, 0); - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray indSubArr = indices(i, dimsToExcludeInd); for (Nd4jLong j = 0; j < indLastDim; ++j) { @@ -170,7 +170,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr if(!calcGrad) { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto subArr = updates(i, dimsToExclude); output.p(i, subArr.e(indices.e(i))); } @@ -179,7 +179,7 @@ void scatterForLoss(nd4j::LaunchContext *context, const NDArray& indices, NDArr samediff::Threads::parallel_for(func, 0, indicesLen); } else { auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto subArr = updates(i, dimsToExclude); auto ind = indices.e(i); subArr.p(ind, subArr.e(ind) - 1.); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index e20145735..08aafc98c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -169,7 +169,7 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { meanV.p(e, meanV.e(e) + listOfTensors.at(i)->e(e)); } }; @@ -223,7 +223,7 @@ namespace helpers { for (int i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { sumT->p(e, sumT->e(e) + listOfTensors.at(i)->e(e)); } }; @@ -272,7 +272,7 @@ namespace helpers { for (int i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { sumT->p(e, sumT->e(e) * listOfTensors.at(i)->e(e)); } }; @@ -625,7 +625,7 @@ namespace helpers { Nd4jLong loop_size = input->lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) <= T(1.e-6)) output->p(e, gradOut->e(classNum)); @@ -645,7 +645,7 @@ namespace helpers { //std::vector> outputs(numOfClasses); auto func 
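In the scatter.cpp hunks above, the rank-1 path boils down to indexed accumulation over one-element sub-arrays (output({idx, idx + 1})); checkIndices_ is what validates idx beforehand. A flat-buffer sketch, assuming op is pairwise add (the other pairwise ops substitute their own combine step):

    // Rank-1 scatter with op = add (sketch).
    #include <cstdint>
    #include <vector>

    template <typename T>
    void scatterRank1Add(const std::vector<int64_t> &indices,
                         const std::vector<T> &updates,
                         std::vector<T> &output) {
        for (size_t i = 0; i < indices.size(); i++)
            output[(size_t)indices[i]] += updates[i];  // indices pre-validated
    }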
= PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -675,7 +675,7 @@ namespace helpers { segmentMinFunctor(context, input, indices, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.e(classNum) - input->e(e)) < 1.e-5) output->p(e, gradOut->e(classNum)); @@ -697,7 +697,7 @@ namespace helpers { int pos = 0; auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -887,7 +887,7 @@ namespace helpers { if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); if (nd4j::math::nd4j_abs(tempRes.t(classNum) - input->t(e)) < 1.e-6) output->t(e) = gradOut->t(classNum); @@ -1004,7 +1004,7 @@ namespace helpers { unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes); if (input->isVector()) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto classNum = indices->e(e); output->p(e, gradOut->e(classNum) * tempRes.e(classNum) / input->e(e)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index 59c257c28..05353bf5e 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -364,7 +364,7 @@ namespace nd4j { auto func = PRAGMA_THREADS_FOR { T sneu1e[600]; - for (auto t = start; t < stop; t += increment) { + for (auto t = start; t < stop; t++) { T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; memset(neu1e, 0, vectorLength * sizeof(T)); @@ -457,7 +457,7 @@ namespace nd4j { T sneu1[600]; T sneu1e[600]; - for (int e = start; e < stop; e += increment) { + for (int e = start; e < stop; e++) { T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? 
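The segment.cpp gradient hunks above share one gating rule: the incoming gradient is routed only to elements that attained the segment's reduced value (min or max), compared within a small tolerance, while segment-prod instead scales by gradOut * prod / input. The gate as a scalar sketch (tolerance mirrors the 1e-5 / 1e-6 literals in the hunks; T is a floating-point type):

    #include <cmath>

    // Gradient gate for segment-min/max backprop (sketch).
    template <typename T>
    T segmentExtremumGrad(T inputElem, T segExtremum, T gradForClass) {
        // gradient passes only through the element(s) equal to the extremum
        return std::abs(segExtremum - inputElem) < T(1.e-5) ? gradForClass
                                                            : T(0);
    }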
sneu1e : new T[vectorLength]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp index 48f7f0d9a..c8774f028 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp @@ -40,7 +40,7 @@ namespace helpers { output->assign(input); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { for (auto r = 0; r < rows; r++) { for (auto c = 0; c < r; c++) { math::nd4j_swap(outputPart[batch]->t(r, c) , outputPart[batch]->t(c, r)); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp index 642dd37da..d2dd3bf30 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp @@ -143,7 +143,7 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray T* pCt = ct->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto col = start; col < stop; col += increment) { + for (auto col = start; col < stop; col++) { const auto colNum = col % d2; bool flip = colNum >= K; T maskVal = mask ? *(pMask + col) : T(1); @@ -236,7 +236,7 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr T* pGradInit = gradC0->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto col = start; col < stop; col += increment) { + for (auto col = start; col < stop; col++) { T gbF = 0.f; T gbR = 0.f; const auto colNum = col % d2; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp index db9b6afff..a3d27702d 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp @@ -37,7 +37,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c int inSize = inArrs.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) outArr->p(i, inArrs[i]->t(0)); }; @@ -50,7 +50,7 @@ static void stack_(const std::vector& inArrs, NDArray* outArr, c int listSize = list.size(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) list.at(i)->assign(inArrs[i]); }; samediff::Threads::parallel_tad(func, 0, listSize); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index e38232928..c4b45b398 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -150,7 +150,7 @@ namespace helpers { result->assign(0); if (status == ND4J_STATUS_OK) { auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { bool found = false; for (int j = 0; j < k; j++) { if (target->e(e) == indices->e(e * k + j)) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index ea5e90cd8..1f630e8e0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -43,7 +43,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N int dLen = dOdI.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + 
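The sg_cb.cpp hunks above also show the per-thread scratch pattern this file uses: a fixed stack array for the common case and a heap fallback for long vectors. In isolation (sketch; the limit mirrors the sneu1e[600] literal, and T is assumed trivially copyable, as the memset implies):

    #include <cstring>

    template <typename T>
    void withScratchSketch(int vectorLength) {
        constexpr int kStackLimit = 600;   // mirrors sneu1e[600]
        T stackBuf[kStackLimit];
        T *neu1e = vectorLength <= kStackLimit ? stackBuf
                                               : new T[vectorLength];
        std::memset(neu1e, 0, vectorLength * sizeof(T));
        // ... accumulate into neu1e ...
        if (neu1e != stackBuf)
            delete[] neu1e;                // heap path must be released
    }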
for (auto i = start; i < stop; i++) { if (dOdI.t(i) != static_cast(0.f)) dOdI.t(i) = static_cast(1.f); } @@ -65,7 +65,7 @@ static void trace_(const NDArray& input, NDArray& output) { auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) output.p(i, setOfSubArrs.at(i)->getTrace()); }; samediff::Threads::parallel_for(func, 0, setOfSubArrs.size()); @@ -189,7 +189,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -220,7 +220,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), coords); const auto zOffset = shape::getOffset(output.getShapeInfo(), coords); @@ -566,7 +566,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { auto func = PRAGMA_THREADS_FOR { Nd4jLong coords[MAX_RANK * 3]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { Nd4jLong *zCoordStart, *xCoordStart; if (yLastDim == xRank) { @@ -650,7 +650,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con else if (input->rankOf() == 1 && indices->isVector()) { // special case auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) + for (auto e = start; e < stop; e++) output->p(e, input->e(indices->e(e))); }; @@ -663,7 +663,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray subArrOut = (*output)(i, dimsOut); NDArray subArrIn = (*input)(indices->e(i), {axis}); subArrOut.assign(subArrIn); @@ -687,7 +687,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { NDArray subArrOut = (*output)(i, {axis}); NDArray subArrIn = (*input)(intArgs[i + 1], {axis}); subArrOut.assign(subArrIn); @@ -710,7 +710,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) { auto arrs = output.allTensorsAlongDimension({rank-2, rank-1}); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) arrs.at(i)->setIdentity(); }; @@ -737,7 +737,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat indices.push_back((*intArgs)[e]); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inSubArr = input(indices[i], dimsToExclude, true); auto updSubArr = updates(i, dimsToExclude, true); @@ -786,7 +786,7 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input case 6: { // copy auto func 
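Among the transforms.cpp hunks above, the 1-D gather special case (output->p(e, input->e(indices->e(e)))) is worth restating on flat buffers, since it is the shape the other gather paths generalize (sketch):

    #include <cstdint>

    // Rank-1 gather: output[e] = input[indices[e]] (sketch; indices are
    // assumed already range-checked, as in the helper).
    template <typename T>
    void gatherVectorSketch(const T *input, const int64_t *indices,
                            T *output, int64_t len) {
        for (int64_t e = 0; e < len; e++)
            output[e] = input[indices[e]];
    }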
= PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inSubArr = input(i, dimensions); inSubArr.p(indices.t(i), updates.e(i)); } @@ -809,7 +809,7 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); Nd4jLong idx = 0; @@ -839,7 +839,7 @@ static void mergeMax_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); for (int i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); @@ -865,7 +865,7 @@ static void mergeAvg_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T sum = 0.; for (int i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); @@ -891,7 +891,7 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { auto x = inArrs[0]; auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T sum = (T) 0.f; for (int i = 0; i < numArgs; i++) sum += inArrs[i]->e(e); @@ -928,7 +928,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { const T iNormActual = norm2.e(i); if (iNormActual > normClip) *listOfInSubArrs.at(i) *= normClip / iNormActual; @@ -952,7 +952,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector& auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto inputSubArr = listOfInSubArrs.at(i); auto outputSubArr = listOfOutSubArrs.at(i); outputSubArr->assign(inputSubArr); @@ -1058,7 +1058,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g auto cn = clipNorm.e(0); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { T N = norm2.e(i); auto gradOSubArr = gradOSubArrs.at(i); @@ -1190,7 +1190,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o auto func = PRAGMA_THREADS_FOR { Nd4jLong inIdx[MAX_RANK]; Nd4jLong outIdx[MAX_RANK]; - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { shape::index2coords(i, output.getShapeInfo(), outIdx); for (int j = 0; j < rank; ++j) { @@ -1225,17 +1225,6 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES); -////////////////////////////////////////////////////////////////////////// -template -static void concat_(const std::vector& inArrs, NDArray& output, const int axis) { - nd4j::SpecialMethods::concatCpuGeneric(inArrs, output, axis); -} - - void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { - BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, 
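The clipByNorm_ hunks above rescale a sub-array by normClip / ||x|| only when its L2 norm exceeds the threshold, leaving its direction unchanged. Per sub-array (sketch):

    #include <cmath>
    #include <cstdint>

    // In-place clip-by-norm for one sub-array (sketch).
    template <typename T>
    void clipSubArrayByNorm(T *x, int64_t len, T normClip) {
        T sumSq = T(0);
        for (int64_t i = 0; i < len; i++)
            sumSq += x[i] * x[i];
        const T norm = std::sqrt(sumSq);
        if (norm > normClip) {
            const T scale = normClip / norm;
            for (int64_t i = 0; i < len; i++)
                x[i] *= scale;
        }
    }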
output, axis), LIBND4J_TYPES); - } - - BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector& inArrs, NDArray& output, const int axis), LIBND4J_TYPES); ////////////////////////////////////////////////////////////////////////// template diff --git a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp index ceb228439..c825a8fee 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp @@ -90,7 +90,7 @@ namespace helpers { auto outputPart = output->allTensorsAlongDimension({-2, -1}); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { if (lower) { lowerTriangularSolve(context, leftPart[i], rightPart[i], adjoint, outputPart[i]); } else { @@ -112,7 +112,7 @@ namespace helpers { auto rows = input->sizeAt(-2); auto batchLoop = PRAGMA_THREADS_FOR { - for (auto batch = start; batch < stop; batch += increment) { + for (auto batch = start; batch < stop; batch++) { if (!lower) { for (auto r = 0; r < rows; r++) { for (auto c = 0; c <= r; c++) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp index 5d4ed9f2e..90ef634c1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp @@ -64,7 +64,7 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray int xLen = x.lengthOf(); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) + for (auto i = start; i < stop; i++) z.p(i, zetaScalar(x.e(i), q.e(i))); }; diff --git a/libnd4j/include/ops/declarable/helpers/cross.h b/libnd4j/include/ops/declarable/helpers/cross.h index 02b7e8467..3ea80966b 100644 --- a/libnd4j/include/ops/declarable/helpers/cross.h +++ b/libnd4j/include/ops/declarable/helpers/cross.h @@ -68,7 +68,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND int tads = tadsA.size(); auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto a_ = tadsA.at(e); auto b_ = tadsB.at(e); auto o_ = tadsO.at(e); diff --git a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp index 8ef63101e..3bcdea865 100644 --- a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp +++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp @@ -69,7 +69,7 @@ namespace helpers { } auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { values->p(e, static_cast(valuesVector[e])); if (counts != nullptr) counts->p(e, countsMap[valuesVector[e]]); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp index 1a35ecd47..8ef8032bb 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp @@ -19,8 +19,10 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0); + + BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), 
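Two notes on the hunks above. First, the standalone concat_ helper is deleted because concat now calls nd4j::SpecialMethods::concatCpuGeneric directly (the implementation lands in specials_single.hpp further down). Second, triangular_solve.cpp dispatches each batch entry to lowerTriangularSolve or upperTriangularSolve; for the lower case that is ordinary forward substitution, sketched here for a single row-major right-hand side without adjoint (the real helper operates on NDArray views, so names and layout here are illustrative assumptions):

    // Forward substitution: solve L x = b for lower-triangular L (sketch).
    template <typename T>
    void forwardSubstitution(const T *L, const T *b, T *x, int n) {
        for (int r = 0; r < n; r++) {
            T sum = b[r];
            for (int c = 0; c < r; c++)
                sum -= L[r * n + c] * x[c];
            x[r] = sum / L[r * n + r];   // assumes a non-zero diagonal
        }
    }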
LIBND4J_TYPES, LIBND4J_TYPES); } \ No newline at end of file diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp index be8edad04..5bb518d76 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp index 915983bb0..27b68e732 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp index d2f59137d..80e2258c7 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp index 29caeae84..e34b0c528 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp index 489d1fc6a..96797cc98 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp index 6f50c4682..70c7f3990 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp index 03a31221f..e2d1df0e9 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp +++ 
b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp index 074f09238..25e14d39f 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp index 8de7c663b..f3b4cbcb6 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_double.hpp" namespace nd4j { BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp index 3e841dfae..4d1575123 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp index 59a215c20..b50c487b7 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp index 77617173d..972b936dd 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp index 2c19c3bc6..9eb99b238 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp 
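The per-chunk compilation units above all change identically: each now includes specials_double.hpp or specials_single.hpp instead of the monolithic specials.hpp, then explicitly instantiates one slice (LIBND4J_TYPES_N) of the type matrix. Splitting instantiations across translation units bounds per-TU compile time and memory. Conceptually, with hypothetical names (an assumption about what the BUILD_*_TEMPLATE macros expand to, not their literal output):

    // impl.hpp: template definition shared by every chunk (hypothetical).
    template <typename X, typename Y>
    struct DoubleMethodsDemo {
        static X convert(Y v) { return static_cast<X>(v); }
    };

    // chunk_3.cpp: instantiates only its slice of the (X, Y) matrix.
    template struct DoubleMethodsDemo<float, double>;
    template struct DoubleMethodsDemo<float, int>;
    // ... the next chunk instantiates the next slice, and so on.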
b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp index cd6babb61..6558d7284 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp index b54028b42..d89652899 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp index 4ca54e7b1..40c9598ee 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp index 3d843ca4c..e49ace221 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp index d8dc34f1c..973b25edc 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8); diff --git a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp index 2c12f2803..b3bf0beeb 100644 --- a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp +++ b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp @@ -19,7 +19,7 @@ // @author raver119@gmail.com // -#include "../specials.hpp" +#include "../specials_single.hpp" namespace nd4j { BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9); diff --git a/libnd4j/include/ops/impl/gemm.cpp b/libnd4j/include/ops/impl/gemm.cpp index 2779bdadf..efd57a7c5 100644 --- a/libnd4j/include/ops/impl/gemm.cpp +++ b/libnd4j/include/ops/impl/gemm.cpp @@ -34,7 +34,7 @@ namespace nd4j { // handle transpose in parallel auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { for (int c = 0; c < cols; c++) { int zIdx = orderTarget == CblasRowMajor ? 
linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c); int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c); @@ -73,7 +73,7 @@ namespace nd4j { C[r] = z; } else { auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) + for (auto r = start; r < stop; r++) C[r] = z; }; samediff::Threads::parallel_for(func, 0, length); @@ -130,7 +130,7 @@ namespace nd4j { auto aT = TRANS == CblasTrans ? reinterpret_cast(nd4j::blas::transpose(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast(x))) : x; auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { int aIdx = linearIndexC(M, N, r, 0); auto aX = aT + aIdx; diff --git a/libnd4j/include/ops/impl/specials_double.hpp b/libnd4j/include/ops/impl/specials_double.hpp new file mode 100644 index 000000000..73f50c772 --- /dev/null +++ b/libnd4j/include/ops/impl/specials_double.hpp @@ -0,0 +1,270 @@ +/******************************************************************************* + * Copyright (c) 2015-2018 Skymind, Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Apache License, Version 2.0 which is available at + * https://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * SPDX-License-Identifier: Apache-2.0 + ******************************************************************************/ + +// +// @author raver119@gmail.com, created on 07.10.2017. 
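The gemm.cpp transpose loop above maps between layouts through linearIndexC / linearIndexF. Their presumed semantics, consistent with the CblasRowMajor / CblasColMajor usage (an assumption, not the project's actual definitions):

    // Presumed row-major (C) and column-major (F) linear indexing.
    inline int linearIndexC(int rows, int cols, int r, int c) {
        return r * cols + c;   // row-major: a row is contiguous
    }
    inline int linearIndexF(int rows, int cols, int r, int c) {
        return c * rows + r;   // column-major: a column is contiguous
    }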
+// @author Yurii Shyrma (iuriish@yahoo.com) +// + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nd4j { + + + template + void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { + auto x = reinterpret_cast(dx); + auto z = reinterpret_cast(dz); + + + auto func = PRAGMA_THREADS_FOR { + for (auto i = start; i < stop; i++) { + z[i] = static_cast(x[i]); + } + }; + + samediff::Threads::parallel_for(func, 0, N); + }; + + + template + void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { + int i = left, j = right; + X ktmp; + X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; + + Y vtmp; + + { + /* PARTITION PART */ + while (i <= j) { + if (descending) { + while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) + i++; + while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } else { + while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) + i++; + while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } + } + + } + + // + + if ( ((right-left) + void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { + int i = left, j = right; + X ktmp; + Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; + + Y vtmp; + + { + /* PARTITION PART */ + while (i <= j) { + if (descending) { + while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) + i++; + while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + j--; + } + } else { + while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) + i++; + while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) + j--; + if (i <= j) { + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; + + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; + + i++; + 
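quickSort_parallel_internal_key above partitions the keys Hoare-style and mirrors every key swap on the values array so the two stay aligned; quickSort_parallel_internal_value is the same routine with the roles reversed. Stripped of the shape-info indirection and the OpenMP task recursion, the algorithm is (sketch, ascending; the descending branch just flips the comparisons):

    #include <utility>

    // Key/value quicksort on flat buffers (sketch).
    template <typename X, typename Y>
    void kvQuickSortSketch(X *key, Y *val, int left, int right) {
        int i = left, j = right;
        const X pivot = key[(left + right) / 2];
        while (i <= j) {                        // partition
            while (key[i] < pivot) i++;
            while (key[j] > pivot) j--;
            if (i <= j) {
                std::swap(key[i], key[j]);
                std::swap(val[i], val[j]);      // values follow their keys
                i++; j--;
            }
        }
        // the real code spawns OpenMP tasks above a cutoff of 1000 elements
        if (left < j)  kvQuickSortSketch(key, val, left, j);
        if (i < right) kvQuickSortSketch(key, val, i, right);
    }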
j--; + } + } + } + + } + + // + + if ( ((right-left) + static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + auto values = reinterpret_cast(yarray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + } + + template + static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ + auto array = reinterpret_cast(varray); + auto values = reinterpret_cast(yarray); + int cutoff = 1000; + + PRAGMA_OMP_PARALLEL_THREADS(numThreads) + { +PRAGMA_OMP_SINGLE_ARGS(nowait) + { + quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); + } + } + } + + template + void DoubleMethods::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { + quickSort_parallel_key(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void DoubleMethods::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { + quickSort_parallel_value(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); + } + + template + void DoubleMethods::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { + auto x = reinterpret_cast(vx); + auto y = reinterpret_cast(vy); + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + + auto xLength = shape::length(xShapeInfo); + auto xTadLength = shape::length(packX.primaryShapeInfo()); + auto numTads = packX.numberOfTads(); + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; + + quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); + } + + template + void DoubleMethods::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { + auto x = reinterpret_cast(vx); + auto y = reinterpret_cast(vy); + + auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); + auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); + + auto xLength = shape::length(xShapeInfo); + auto xTadLength = shape::length(packX.primaryShapeInfo()); + auto numTads = packX.numberOfTads(); + + auto func = PRAGMA_THREADS_FOR { + for (auto r = start; r < stop; r++) { + auto dx = x + packX.primaryOffsets()[r]; + auto dy = y + packY.primaryOffsets()[r]; + + quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); + } + }; + + samediff::Threads::parallel_tad(func, 0, numTads); + } +} + diff --git a/libnd4j/include/ops/impl/specials.hpp b/libnd4j/include/ops/impl/specials_single.hpp similarity index 56% rename from 
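sortTadByKey / sortTadByValue above parallelise across TADs (the sub-array views produced by ConstantTadHelper) and run a single-threaded key/value sort inside each slice, which is why quickSort_parallel_key is invoked with numThreads = 1 there. The dispatch shape, reusing the sketch from the previous note (the offset arrays stand in for packX/packY.primaryOffsets()):

    #include <cstdint>

    // Sort each of numTads slices independently (sketch).
    template <typename X, typename Y>
    void sortTadsSketch(X *x, Y *y,
                        const int64_t *xOffsets, const int64_t *yOffsets,
                        int64_t numTads, int64_t tadLength) {
        // this outer loop is what parallel_tad distributes across threads
        for (int64_t r = 0; r < numTads; r++)
            kvQuickSortSketch(x + xOffsets[r], y + yOffsets[r],
                              0, (int)tadLength - 1);
    }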
libnd4j/include/ops/impl/specials.hpp rename to libnd4j/include/ops/impl/specials_single.hpp index 207ca5964..030e9c6d7 100644 --- a/libnd4j/include/ops/impl/specials.hpp +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -64,7 +64,7 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND T* outBuff = output.bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { const Nd4jLong arrLen = inArrs[r]->lengthOf(); const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; @@ -99,7 +99,7 @@ void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, ND } auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { auto temp = output(indices[i], true); nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); } @@ -143,7 +143,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint auto x = reinterpret_cast(vx); auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (auto ar = 0L; ar < n; ar++) { z[i] += x[ar][i]; } @@ -179,7 +179,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint } auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (Nd4jLong ar = 1; ar < n; ar++) { z[i] += x[ar][i] / static_cast(n); } @@ -199,7 +199,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint // aggregation step auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { + for (auto i = start; i < stop; i++) { for (Nd4jLong ar = 0; ar < n; ar++) { z[i] += x[ar][i] / static_cast(n); } @@ -336,7 +336,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) int numTads = xLength / xTadLength; auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { + for (auto r = start; r < stop; r++) { T *dx = x + tadOffsets[r]; quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending); @@ -358,7 +358,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) auto func = PRAGMA_THREADS_FOR { - for (auto e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { for (int bitId = 0; bitId < 16; bitId++) { bool hasBit = (x[e] & 1 << (bitId)) != 0; bool hasSign = (x[e] & 1 << (bitId + 16)) != 0; @@ -378,22 +378,6 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) samediff::Threads::parallel_for(func, 4, lim); } - template - void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) { - auto x = reinterpret_cast(dx); - auto z = reinterpret_cast(dz); - - - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i += increment) { - z[i] = static_cast(x[i]); - } - }; - - samediff::Threads::parallel_for(func, 0, N); - }; - BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES); - template Nd4jLong SpecialMethods::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) { auto dx = reinterpret_cast(vx); @@ -442,226 +426,5 @@ PRAGMA_OMP_SINGLE_ARGS(nowait) }; return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16); } - - template - void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, 
int left, int right, int cutoff, bool descending) { - int i = left, j = right; - X ktmp; - X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)]; - - Y vtmp; - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot) - i++; - while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; - values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; - values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } else { - while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) - i++; - while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; - values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; - values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { - int i = left, j = right; - X ktmp; - Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; - - Y vtmp; - - { - /* PARTITION PART */ - while (i <= j) { - if (descending) { - while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) - i++; - while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; - value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; - value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } else { - while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) - i++; - while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) - j--; - if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; - key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; - key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - - vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; - value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; - value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; - - i++; - j--; - } - } - } - - } - - // - - if ( ((right-left) - static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - auto values = reinterpret_cast(yarray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - } - - template - static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, 
Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){ - auto array = reinterpret_cast(varray); - auto values = reinterpret_cast(yarray); - int cutoff = 1000; - - PRAGMA_OMP_PARALLEL_THREADS(numThreads) - { -PRAGMA_OMP_SINGLE_ARGS(nowait) - { - quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending); - } - } - } - - template - void DoubleMethods::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { - quickSort_parallel_key(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - } - - template - void DoubleMethods::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) { - quickSort_parallel_value(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending); - } - - template - void DoubleMethods::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); - - auto xLength = shape::length(xShapeInfo); - auto xTadLength = shape::length(packX.primaryShapeInfo()); - auto numTads = packX.numberOfTads(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; - - quickSort_parallel_key(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } - }; - - samediff::Threads::parallel_tad(func, 0, numTads); - } - - template - void DoubleMethods::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) { - auto x = reinterpret_cast(vx); - auto y = reinterpret_cast(vy); - - auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength); - auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength); - - auto xLength = shape::length(xShapeInfo); - auto xTadLength = shape::length(packX.primaryShapeInfo()); - auto numTads = packX.numberOfTads(); - - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r += increment) { - auto dx = x + packX.primaryOffsets()[r]; - auto dy = y + packY.primaryOffsets()[r]; - - quickSort_parallel_value(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending); - } - }; - - samediff::Threads::parallel_tad(func, 0, numTads); - } - - //BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES); - //BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index a25aa36ec..354f8e328 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -167,7 +167,7 @@ namespace randomOps { if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) { auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { T prob = rng->relativeT(e); T cumProb = (T) 0.0f; for (Nd4jLong f = 0; f < yLength; f++) { @@ -330,7 +330,7 @@ namespace randomOps { const T 
epsilon = static_cast(1e-5); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto epm = e + middle; // we need to get random values @@ -440,7 +440,7 @@ namespace randomOps { nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { - for (Nd4jLong e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -549,7 +549,7 @@ namespace randomOps { //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { int success = 0; for (int t = 1; t <= trials; t++) { @@ -690,7 +690,7 @@ namespace randomOps { const T epsilon = static_cast(1e-5); auto func = PRAGMA_THREADS_FOR { - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { if (z[e] > mean + ds || z[e] < mean - ds) { z[e] = step(rng, mean, stddev, e, middle, z[e]); @@ -818,7 +818,7 @@ namespace randomOps { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t e = start; e < stop; e += increment) { + for (auto e = start; e < stop; e++) { auto epm = e + middle; // we need to get random values
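Finally, the first special_random_ops.h hunk above is the categorical-sampling kernel: each output element draws one uniform value (rng->relativeT(e)) and scans the cumulative probability mass until it is passed; the binomial kernels below it likewise count successes across trials per element. The scan in isolation (sketch):

    #include <cstdint>

    // Inverse-CDF draw: first index whose cumulative probability
    // reaches u (sketch; u is uniform in [0, 1)).
    template <typename T>
    int64_t sampleIndexSketch(const T *probs, int64_t yLength, T u) {
        T cumProb = T(0);
        for (int64_t f = 0; f < yLength; f++) {
            cumProb += probs[f];
            if (u <= cumProb)
                return f;
        }
        return yLength - 1;   // guard against floating-point shortfall
    }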