Minor improvements (#255)

* static increments in loops

  Signed-off-by: raver119 <raver119@gmail.com>

* specials and concat split into separate units

  Signed-off-by: raver119 <raver119@gmail.com>
2020-02-20 11:43:26 +03:00 · 2020-02-20 11:43:26 +03:00 · 215641ea9e
commit 215641ea9e
parent d9058b469a
83 changed files with 529 additions and 464 deletions
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@ -501,7 +501,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha
    auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

    auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
                auto cdata = data + offsets[e];
                if (dataType == DataType::UTF16) {
                    unicode::utf8to16(string[e], cdata, std::char_traits<char>::length(string[e]));
@ -568,7 +568,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::stri
    auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

    auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dataType == DataType::UTF16) {
                 unicode::utf8to16(string[e].data(), cdata, string[e].size());
@ -635,7 +635,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::u16s
    auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

    auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dtype == DataType::UTF16) {
                 memcpy(cdata, string[e].data(), string[e].size() * sizeof(uint16_t));
@ -701,7 +701,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha


    auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
             auto cdata = data + offsets[e];
             if (dtype == DataType::UTF16) {
                 memcpy(cdata, string[e], std::char_traits<char16_t>::length(string[e]) * sizeof(uint16_t));
@ -767,7 +767,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<std::u32s
    auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

    auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            auto cdata = data + offsets[e];
            if (dtype == DataType::UTF16) {
                unicode::utf32to16(string[e].data(), cdata, string[e].size());
@ -833,7 +833,7 @@ NDArray::NDArray(const std::vector<Nd4jLong>& shape, const std::vector<const cha
    auto data = reinterpret_cast<int8_t*>(bufferAsT<int8_t>() + headerLength);

    auto func = PRAGMA_THREADS_FOR{
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            auto cdata = data + offsets[e];
            if (dtype == DataType::UTF16) {
                unicode::utf32to16(string[e], cdata, std::char_traits<char32_t>::length(string[e]));
@ -2367,7 +2367,7 @@ NDArray NDArray::asS() const {
    const auto inData = bufferAsT<int8_t>() + offsetsLength;

    auto func = PRAGMA_THREADS_FOR{
-        for (int e = start; e < stop; e += increment) {
+        for (int e = start; e < stop; e++) {
           auto cdata = outData + offsets[e];
           auto end = nInputoffsets[e + 1];
           auto idata = inData + nInputoffsets[e];
@ -3466,7 +3466,7 @@ NDArray NDArray::dup(const char newOrder) const {
            std::vector<std::string> strings(lengthOf());

            auto func = PRAGMA_THREADS_FOR{
-                    for (auto i = start; i < stop; i += increment) {
+                    for (auto i = start; i < stop; i++) {
                           strings[i] = std::move(this->e<std::string>(i));
                    }
            };
@ -3479,7 +3479,7 @@ NDArray NDArray::dup(const char newOrder) const {
            std::vector<std::u16string> strings(lengthOf());

            auto func = PRAGMA_THREADS_FOR{
-                    for (auto i = start; i < stop; i += increment) {
+                    for (auto i = start; i < stop; i++) {
                           strings[i] = std::move(this->e<std::u16string>(i));
                    }
            };
@ -3491,7 +3491,7 @@ NDArray NDArray::dup(const char newOrder) const {

        std::vector<std::u32string> strings(lengthOf());
        auto func = PRAGMA_THREADS_FOR{
-               for (auto i = start; i < stop; i += increment) {
+               for (auto i = start; i < stop; i++) {
                      strings[i] = std::move(this->e<std::u32string>(i));
               }
        };
--- a/libnd4j/blas/cpu/NDArray.cpp
+++ b/libnd4j/blas/cpu/NDArray.cpp
@ -98,7 +98,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t

    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, target.getShapeInfo(), coords);
            const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);

@ -152,7 +152,7 @@ static void templatedSwap(void *xBuffer, void *yBuffer, Nd4jLong length) {
    auto y = reinterpret_cast<T *>(yBuffer);

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            auto temp = x[i];
            x[i] = y[i];
            y[i] = temp;
@ -266,7 +266,7 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
    if(result.ordering() == 'c') {           //  ews == 1 always here

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
                BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), i, this->getBuffer(), yOffset), LIBND4J_TYPES);
            }
@ -277,7 +277,7 @@ NDArray NDArray::tile(const std::vector<Nd4jLong>& reps) const {
    else {

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto xOffset = result.getOffset(i);
                auto yOffset = shape::subArrayOffset(i, newShapeInfo, getShapeInfo());
                BUILD_SINGLE_SELECTOR(xType, this->template templatedAssign,(result.getBuffer(), xOffset, this->getBuffer(), yOffset), LIBND4J_TYPES);
@ -377,7 +377,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
    // loop through input array
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), coords);

            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
--- a/libnd4j/blas/cpu/NDArrayLambda.hpp
+++ b/libnd4j/blas/cpu/NDArrayLambda.hpp
@ -22,7 +22,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
    if (this->ordering() == second.ordering() && this->ordering() == third.ordering()  && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == second.ews() && this->ews() == third.ews()) {

        auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                z[e] = func(f[e], s[e], t[e]);
        };

@ -31,7 +31,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
        if (f == z) {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto tOffset = this->getOffset(e);
                    auto uOffset = second.getOffset(e);
                    auto vOffset = third.getOffset(e);
@ -44,7 +44,7 @@ void NDArray::applyTriplewiseLambda(NDArray& second, NDArray& third, const std::
        } else {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto tOffset = this->getOffset(e);
                    auto uOffset = second.getOffset(e);
                    auto vOffset = third.getOffset(e);
@ -93,7 +93,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
    if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {

        auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                z[e] = func(f[e], s[e]);
        };

@ -102,7 +102,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
        if (f == z) {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto xOffset = this->getOffset(e);
                    auto yOffset = other.getOffset(e);

@ -114,7 +114,7 @@ void NDArray::applyPairwiseLambda(const NDArray& other, const std::function<T(T,
        } else {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto xOffset = this->getOffset(e);
                    auto yOffset = other.getOffset(e);
                    auto zOffset = target.getOffset(e);
@ -156,7 +156,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
    if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {

        auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                z[e] = func(f[e]);
        };

@ -165,7 +165,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
        if (f == z) {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto xOffset = this->getOffset(e);

                    f[xOffset] = func(f[xOffset]);
@ -176,7 +176,7 @@ void NDArray::applyLambda(const std::function<T(T)>& func, NDArray& target) {
        } else {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto xOffset = this->getOffset(e);
                    auto zOffset = target.getOffset(e);

@ -217,7 +217,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
    if (this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1)) {

        auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                z[e] = func(e, f[e]);
        };

@ -226,7 +226,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
        if (f == z) {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto xOffset = this->getOffset(e);

                    f[xOffset] = func(e, f[xOffset]);
@ -237,7 +237,7 @@ void NDArray::applyIndexedLambda(const std::function<T(Nd4jLong, T)>& func, NDAr
        } else {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto xOffset = this->getOffset(e);
                    auto zOffset = target.getOffset(e);

@ -283,7 +283,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
    if (this->ordering() == other.ordering() && this->ordering() == target.ordering() && (this->ews() == 1 && target.ews() == 1) && this->ews() == other.ews()) {

        auto loop = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment)
+            for (auto e = start; e < stop; e++)
                z[e] = func((Nd4jLong) e, f[e], s[e]);
        };

@ -292,7 +292,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
        if (f == z) {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto xOffset = this->getOffset(e);
                    auto yOffset = other.getOffset(e);

@ -304,7 +304,7 @@ void NDArray::applyIndexedPairwiseLambda(NDArray& other, const std::function<T(N
        } else {

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto xOffset = this->getOffset(e);
                    auto yOffset = other.getOffset(e);
                    auto zOffset = target.getOffset(e);
--- a/libnd4j/blas/cpu/NativeOps.cpp
+++ b/libnd4j/blas/cpu/NativeOps.cpp
@ -1291,7 +1291,7 @@ void pullRowsGeneric(void *vx,
    _threads = nd4j::math::nd4j_min<int>(_threads, nd4j::Environment::getInstance()->maxThreads());

    auto func = PRAGMA_THREADS_FOR {
-        for (auto idx = start; idx < stop; idx += increment) {
+        for (auto idx = start; idx < stop; idx++) {
            auto xTadOffsetForBlock = tadOffsets[indexes[idx]];
            auto zTadOffsetForBlock = zTadOffsets[idx];

@ -1356,7 +1356,7 @@ void tearGeneric(void *vx,
    auto numTads = shape::length(hXShapeInfo) / tadLength;

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            auto hZ = reinterpret_cast<T *>(targets[i]);
            auto s = hX + tadOffsets[i];

@ -1478,7 +1478,7 @@ void shuffleGeneric(void **hX, Nd4jLong **hXShapeInfo, void **dz, Nd4jLong **hZS
    auto dZ = reinterpret_cast<T **>(dz);

    auto func = PRAGMA_THREADS_FOR {
-        for (auto f = start; f < stop; f += increment) {
+        for (auto f = start; f < stop; f++) {
            auto hX = reinterpret_cast<T *>(dX[f]);
            //auto hZ = reinterpret_cast<T *>(dZ[f]);

--- a/libnd4j/include/array/DataTypeConversions.h
+++ b/libnd4j/include/array/DataTypeConversions.h
@ -52,7 +52,7 @@ namespace nd4j {
                                TypeCast::convertGeneric<T2, T>(nullptr, tmp, length, buffer);
 #else
                auto func = PRAGMA_THREADS_FOR {
-                    for (auto e = start; e < stop; e += increment)
+                    for (auto e = start; e < stop; e++)
                        buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
                };

@ -110,7 +110,7 @@ namespace nd4j {
                                TypeCast::convertGeneric<float, T>(nullptr, tmp, length, buffer);
 #else
                            auto func = PRAGMA_THREADS_FOR {
-                                for (auto e = start; e < stop; e += increment)
+                                for (auto e = start; e < stop; e++)
                                    buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
                            };

@ -138,7 +138,7 @@ namespace nd4j {

 #else
                            auto func = PRAGMA_THREADS_FOR {
-                                for (auto e = start; e < stop; e += increment)
+                                for (auto e = start; e < stop; e++)
                                    buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
                            };

@ -164,7 +164,7 @@ namespace nd4j {
                                TypeCast::convertGeneric<float16, T>(nullptr, tmp, length, buffer);
 #else
                            auto func = PRAGMA_THREADS_FOR {
-                                for (auto e = start; e < stop; e += increment)
+                                for (auto e = start; e < stop; e++)
                                    buffer[e] = canKeep ? static_cast<T>(tmp[e]) : BitwiseUtils::swap_bytes<T>(static_cast<T>(tmp[e]));
                            };

--- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp
+++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.hpp
@ -49,7 +49,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
        case nd4j::LoopKind::EWS1: {

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -70,7 +70,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
        case nd4j::LoopKind::EWSNONZERO: {

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -91,7 +91,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
        case nd4j::LoopKind::RANK1: {

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -114,7 +114,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
            shape::updateStrides(2, tadShape, newStride, 'c');

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -141,7 +141,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
            shape::updateStrides(3, tadShape, newStride, 'c');

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -170,7 +170,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
            shape::updateStrides(4, tadShape, newStride, 'c');

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -201,7 +201,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
            shape::updateStrides(5, tadShape, newStride, 'c');

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -234,7 +234,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
            const bool canCastZ   = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo,   castZShapeInfo);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -258,7 +258,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
            const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeInfo, castTadShapeInfo);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

@ -284,7 +284,7 @@ void nd4j::IndexReductionLoops<X,Z>::loopIndexReduce(X* x, Nd4jLong* xShapeInfo,
            const bool canCastZ   = nd4j::DataTypeUtils::castShapeInfo<uint>(zShapeInfo,   castZShapeInfo);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto tad = const_cast<X *>(x) + tadOffsets[i];
                    auto indexValue = OpType::startingIndexValue(tad);

--- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
+++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
@ -80,7 +80,7 @@ namespace nd4j {
                int nLen = zArr.lengthOf() / yArr.sizeAt(-1);

                auto func = PRAGMA_THREADS_FOR{
-                     for (uint32_t total = start; total < stop; total += increment) {
+                     for (uint32_t total = start; total < stop; total++) {

                        uint32_t i = total / zDim1;
                        uint32_t j = total % zDim1;
--- a/libnd4j/include/loops/cpu/indexreduce.hpp
+++ b/libnd4j/include/loops/cpu/indexreduce.hpp
@ -73,7 +73,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
        auto func = PRAGMA_THREADS_FOR {
            intermediatery[thread_id] = OpType::startingIndexValue(x);

-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                IndexValue<X> curr(x[i], i);
                intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
            }
@ -88,7 +88,7 @@ Nd4jLong IndexReduce<X, Y>::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex
        auto func = PRAGMA_THREADS_FOR {
            intermediatery[thread_id] = OpType::startingIndexValue(x);

-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                IndexValue<X> curr(x[offset], i);
                intermediatery[thread_id] = OpType::update(intermediatery[thread_id], curr, extraParams);
--- a/libnd4j/include/loops/cpu/random.hpp
+++ b/libnd4j/include/loops/cpu/random.hpp
@ -75,7 +75,7 @@ namespace functions {

                    auto func = PRAGMA_THREADS_FOR {
                        PRAGMA_OMP_SIMD
-                        for (auto i = start; i < stop; i += increment)  {
+                        for (auto i = start; i < stop; i++)  {
                            auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                            z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
                        }
@ -93,7 +93,7 @@ namespace functions {

                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i += increment)  {
+                    for (uint64_t i = start; i < stop; i++)  {
                        auto offset  = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
                        z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
@ -111,7 +111,7 @@ namespace functions {

                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i += increment)  {
+                    for (uint64_t i = start; i < stop; i++)  {
                        auto offset  = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
                        z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
@ -129,7 +129,7 @@ namespace functions {

                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i += increment)  {
+                    for (uint64_t i = start; i < stop; i++)  {
                        auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto offset  = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
                        z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
@ -149,7 +149,7 @@ namespace functions {

                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i += increment)  {
+                    for (uint64_t i = start; i < stop; i++)  {
                        auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
                        auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
@ -197,7 +197,7 @@ namespace functions {
                else{
                    auto func = PRAGMA_THREADS_FOR {
                        PRAGMA_OMP_SIMD
-                        for (uint64_t i = start; i < stop; i += increment)  {
+                        for (uint64_t i = start; i < stop; i++)  {
                            auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                            z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
                        }
@ -213,7 +213,7 @@ namespace functions {

                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i += increment)  {
+                    for (uint64_t i = start; i < stop; i++)  {
                        auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
                        z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
@ -255,7 +255,7 @@ namespace functions {

                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i += increment)  {
+                    for (uint64_t i = start; i < stop; i++)  {
                        auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
                        z[offset] = OpClass::op(i, length, rng, extraArguments);
                    }
--- a/libnd4j/include/loops/cpu/reduce3.hpp
+++ b/libnd4j/include/loops/cpu/reduce3.hpp
@ -88,7 +88,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,

    if (kindOfLoop == nd4j::LoopKind::EWS1) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[i], y[i], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
            }
        };
@ -98,7 +98,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
    } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) {

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
            }
@ -110,7 +110,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
        const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast);

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
                intermediate[thread_id] = OpType::update(intermediate[thread_id], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * thread_id), extraParamsLocal + 3 * thread_id);
--- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp
+++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp
@ -158,7 +158,7 @@ namespace functions {
            const bool canCast = tadEWS == 1 && tadOrder == 'c' ? false : nd4j::DataTypeUtils::castShapeInfo<uint>(tadShapeShapeInfo, tadShapeShapeInfoCast);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto r = start; r < stop; r += increment) {
+                for (auto r = start; r < stop; r++) {

                    auto tadOffsetForBlock = tadPack.primaryOffsets()[r];
                    auto tx = x + tadOffsetForBlock;
--- a/libnd4j/include/loops/impl/type_conversions.cpp
+++ b/libnd4j/include/loops/impl/type_conversions.cpp
@ -81,7 +81,7 @@ namespace nd4j {

        // now we actually apply quantization
        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                rz[e] = static_cast<char>(nd4j::math::nd4j_round<float, char>( 1.0f * static_cast<float>(x[e]) / nd4j::math::nd4j_max<float>(amax, amin) * max_byte));
            }
        };
@ -177,7 +177,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
        int flimit = limit + 4;

        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                int el = x[e];
                int ael = nd4j::math::nd4j_abs<int>(el) - 1;
                z[ael] += el > 0 ? static_cast<T>(threshold) : static_cast<T>(-threshold);
@ -202,7 +202,7 @@ PRAGMA_OMP_ATOMIC_ARGS(write)
        auto z = reinterpret_cast<T *>(dz);

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                z[i] = static_cast<T>(static_cast<float>(x[i]));
            }
        };
--- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
@ -153,7 +153,7 @@ namespace helpers {
        auto rowSize = sizeof(T) * colCount;

        auto func = PRAGMA_THREADS_FOR {
-            for (auto n = start; n < stop; n += increment) {
+            for (auto n = start; n < stop; n++) {
                int s = rowP->e<int>(n);
                int end = rowP->e<int>(n + 1);
                int shift = n * colCount;
--- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
@ -291,7 +291,7 @@ static void softmax_(nd4j::LaunchContext * context, const NDArray& input, NDArra
            shape::calcOffsets(tadShapeInfo, offsets);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto inBuff = input.bufferAsT<T>() + tadOffsets[i];
                    auto outBuff = output.bufferAsT<T>() + tadOffsets[i];

@ -341,7 +341,7 @@ void prelu(nd4j::LaunchContext * context, const NDArray& input, const NDArray& a
    const Nd4jLong* alphaShapeInfo = alpha.getShapeInfo();

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            // FIXME: double!
            double x = input.e<double>(i);
            if (x < 0.0) {
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_hue.cpp
@ -67,7 +67,7 @@ static void adjustHue_(const NDArray *input, const NDArray* deltaScalarArr, NDAr
        const Nd4jLong zDimCstride = output->stridesOf()[dimC];

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {

                const T *xTad = x + packX.platformOffsets()[i];
                T *zTad = z + packZ.platformOffsets()[i];
--- a/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/adjust_saturation.cpp
@ -66,7 +66,7 @@ static void adjustSaturation_(const NDArray *input, const NDArray* factorScalarA
        const Nd4jLong zDimCstride = output->stridesOf()[dimC];

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                const T *xTad = x + packX.platformOffsets()[i];
                T *zTad = z + packZ.platformOffsets()[i];

--- a/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batched_gemm.cpp
@ -94,7 +94,7 @@ void bgemm_(const std::vector<NDArray*>& vA, const std::vector<NDArray*>& vB, st
        int vaSize = vA.size();

        auto func = PRAGMA_THREADS_FOR {
-            for (auto p = start; p < stop; p += increment) {
+            for (auto p = start; p < stop; p++) {
                auto A = reinterpret_cast<T *>(vA.at(p)->buffer());
                auto B = reinterpret_cast<T *>(vB.at(p)->buffer());
                auto C = reinterpret_cast<T *>(vC.at(p)->buffer());
--- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
@ -141,7 +141,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray

        Nd4jLong coords[MAX_RANK];

-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {

            shape::index2coords(i, input->getShapeInfo(), coords);

--- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp
@ -117,7 +117,7 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con
 	int xLen = x.lengthOf();

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            output.t<T>(i) = betaIncCore<T>(a.t<T>(i), b.t<T>(i), x.t<T>(i));
    };

--- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input,  NDArray& outp
        auto func = PRAGMA_THREADS_FOR {
            T *col, *im;

-            for (uint b = start; b < stop; b += increment) {
+            for (uint b = start; b < stop; b++) {
                T *im0 = imBuff + b * imStride0;
                T *col4 = colBuff + b * colStride0;
                for (int colH = 0; colH < oH; ++colH, col4 += colStride4) {
--- a/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/concat.cpp
@ -0,0 +1,41 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com), created on 20.04.2018
+//
+
+
+#include <ops/declarable/helpers/transforms.h>
+#include <ops/specials.h>
+
+namespace nd4j {
+    namespace ops {
+        namespace helpers {
+            // concat_<T>: typed worker; forwards inArrs, output and axis unchanged to nd4j::SpecialMethods<T>::concatCpuGeneric
+            template<typename T>
+            static void concat_(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
+                nd4j::SpecialMethods<T>::concatCpuGeneric(inArrs, output, axis);
+            }
+
+            void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) { // non-template entry point; `context` is not referenced in this body
+                BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES); // hands output.dataType(), concat_ and the call arguments to the selector macro (defined elsewhere; presumably picks the matching T)
+            }
+
+            BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector<NDArray*>& inArrs, NDArray& output, const int axis), LIBND4J_TYPES); // presumably emits explicit instantiations of concat_ for LIBND4J_TYPES; confirm against the macro definition
+        }
+    }
+}
--- a/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/confusion.cpp
@ -32,7 +32,7 @@ namespace helpers {
        int lLen = labels->lengthOf();

        auto func = PRAGMA_THREADS_FOR {
-            for (int j = start; j < stop; j += increment) {
+            for (int j = start; j < stop; j++) {
                auto label = labels->e<Nd4jLong>(j);
                auto pred = predictions->e<Nd4jLong>(j);
                T value = (weights == nullptr ? (T) 1.0f : weights->e<T>(j));
--- a/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/crop_and_resize.hpp
@ -50,7 +50,7 @@ namespace nd4j {
                    T widthScale = (cropWidth > 1) ? (x2 - x1) * (imageWidth - 1) / (cropWidth - 1) : T(0);

                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto y = start; y < stop; y += increment) {
+                        for (auto y = start; y < stop; y++) {
                            const float inY = (cropHeight > 1)
                                              ? y1 * (imageHeight - 1) + y * heightScale
                                              : 0.5 * (y1 + y2) * (imageHeight - 1);
--- a/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/cross.cpp
@ -39,7 +39,7 @@ void crossBatched(nd4j::LaunchContext * context, NDArray *a, NDArray *b, NDArray
    int tads = tadsA.size();

    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            auto a_ = tadsA.at(e);
            auto b_ = tadsB.at(e);
            auto o_ = tadsO.at(e);
--- a/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/d_t_s.cpp
@ -46,7 +46,7 @@ namespace helpers {
        if (isNHWC) {
            const int total_count = batch_size * output_height * output_width * output_depth;
            auto func = PRAGMA_THREADS_FOR {
-                for (auto out_idx = start; out_idx < stop; out_idx += increment) {
+                for (auto out_idx = start; out_idx < stop; out_idx++) {
                    const int d = out_idx % output_depth;
                    const int out_idx2 = out_idx / output_depth;
                    const int w = out_idx2 % output_width;
@ -70,7 +70,7 @@ namespace helpers {
            const int total_count = batch_size * input_depth_by_input_area;

            auto func = PRAGMA_THREADS_FOR {
-                for (int input_idx = start; input_idx < stop; input_idx += increment) {
+                for (int input_idx = start; input_idx < stop; input_idx++) {
                    const int n_bY_bX_oC_iY = input_idx / input_width;
                    const int iX = input_idx - n_bY_bX_oC_iY * input_width;

--- a/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/diGamma.cpp
@ -32,7 +32,7 @@ template <typename T>
 static void diGamma_(const NDArray& x, NDArray& z) {

 	auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            z.p(i, diGammaScalar<T>(x.e<T>(i)));
    };
 	samediff::Threads::parallel_for(func, 0, x.lengthOf());
--- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
@ -35,7 +35,7 @@ namespace helpers {
        int inLen = input->lengthOf();

        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                float val = nodeRng.relativeT<T>(e, T(0.f), T(1.f));

                if (val < probValue)
@ -130,7 +130,7 @@ namespace helpers {
        nd4j::graph::RandomGenerator nodeRng(3019L, seed);

        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                float randVal = nodeRng.relativeT(e, T(0.f), T(1.f));
                float xVal = input->e<float>(e);
                output->p<float>(e, randVal >= probValue ? alpha * beta + alpha1 : alpha * xVal + alpha1);
--- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
@ -62,7 +62,7 @@ namespace nd4j {
                    unsigned int outSize = outputList.size();

                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto i = start; i < stop; i += increment) {
+                        for (auto i = start; i < stop; i++) {
                            outputs[i].first = outputList[i];
                            outputs[i].second = 0;
                            for (int e = 0; e < indices->lengthOf(); ++e)
@ -168,7 +168,7 @@ namespace nd4j {
                    unsigned int gradsSize = inputGradientList.size();

                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto i = start; i < stop; i += increment) {
+                        for (auto i = start; i < stop; i++) {
                            outputs[i].first = inputGradientList[i];
                            outputs[i].second = 0;
                            for (int e = 0; e < indices->lengthOf(); ++e)
--- a/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/extract_patches.cpp
@ -50,7 +50,7 @@ namespace helpers {
            colCast = 0;

       auto func = PRAGMA_THREADS_FOR {
-           for (auto batch = 0; batch < stop; batch += increment) {
+           for (auto batch = start; batch < stop; batch++) { // begin at `start` (not 0): handle only the [start, stop) range given to this invocation
               auto patch = listOfMatricies.at(batch);
               auto outMatrix = listOfOutputs.at(batch);

--- a/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/gather.cpp
@ -59,7 +59,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
            if(input->rankOf() == 1 && output->rankOf() == 1) {

                auto func = PRAGMA_THREADS_FOR {
-                    for (auto i = start; i < stop; i += increment)
+                    for (auto i = start; i < stop; i++)
                        output->p(i, input->e(indices->e<Nd4jLong>(i)));
                };

@ -88,7 +88,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*

                    auto func = PRAGMA_THREADS_FOR {

-                        for (auto i = start; i < stop; i += increment) {
+                        for (auto i = start; i < stop; i++) {

                            void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]);
                            void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@ -100,7 +100,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*
                }
                else {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto i = start; i < stop; i += increment) {
+                        for (auto i = start; i < stop; i++) {

                            void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[indices->e<Nd4jLong>(i)]);
                            void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@ -140,7 +140,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*

                auto func = PRAGMA_THREADS_FOR {

-                    for (auto i = start; i < stop; i += increment) {
+                    for (auto i = start; i < stop; i++) {

                        void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
                        void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
@ -155,7 +155,7 @@ void gather(nd4j::LaunchContext * context, const NDArray* input, const NDArray*

                auto func = PRAGMA_THREADS_FOR {

-                    for (auto i = start; i < stop; i += increment) {
+                    for (auto i = start; i < stop; i++) {

                        void* inBuff  =  input->bufferWithOffset(inTadPack.primaryOffsets()[intArgs[i + 1]]);
                        void* outBuff = output->bufferWithOffset(outTadPack.primaryOffsets()[i]);
--- a/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/hamming.cpp
@ -56,7 +56,7 @@ namespace nd4j {

                if (xEws == 1 && yEws == 1 && x.ordering() == y.ordering()) {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            auto _x = static_cast<unsigned long long>(xBuffer[e]);
                            auto _y = static_cast<unsigned long long>(yBuffer[e]);

@ -67,7 +67,7 @@ namespace nd4j {
                    maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
                } else if (xEws > 1 && yEws > 1 && x.ordering() == y.ordering()) {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            auto _x = static_cast<unsigned long long>(xBuffer[e * xEws]);
                            auto _y = static_cast<unsigned long long>(yBuffer[e * yEws]);

@ -78,7 +78,7 @@ namespace nd4j {
                    maxThreads = samediff::Threads::parallel_for(func, 0, lengthOf);
                } else {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            auto _x = static_cast<unsigned long long>(x.e<Nd4jLong>(e));
                            auto _y = static_cast<unsigned long long>(y.e<Nd4jLong>(e));

--- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
@ -42,7 +42,7 @@ namespace nd4j {

                // we divide array into 32 element chunks, and store intermediate results once
                auto func = PRAGMA_THREADS_FOR {
-                    for (auto b = 0; b < stop; b += increment) {
+                    for (auto b = start; b < stop; b++) { // begin at `start` (not 0), matching func2 in this file: handle only this invocation's [start, stop) blocks
                        auto blockBuffer = buffer + b * numBlocks;

                        Nd4jLong r = 1;
@ -64,7 +64,7 @@ namespace nd4j {


                    auto func2 = PRAGMA_THREADS_FOR {
-                        for (auto b = start; b < stop; b += increment) {
+                        for (auto b = start; b < stop; b++) {
                            auto blockBuffer = tempBuffer + b * numBlocks;

                            Nd4jLong r = 1;
--- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
@ -280,7 +280,7 @@ namespace helpers {
        int xsSize = xs.size();
        // Scale x interpolation weights to avoid a multiplication during iteration.
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                xs[i]._bottomIndex *= channels;
                xs[i]._topIndex *= channels;
            }
@ -906,7 +906,7 @@ namespace helpers {
        auto outputPtr = output->bufferAsT<float>(); // output is always float. TO DO: provide another float types also with  template <typename X, typename Z> declaration

        auto batchProcess = PRAGMA_THREADS_FOR {
-            for (auto batch = start; batch < stop; batch += increment) {
+            for (auto batch = start; batch < stop; batch++) {
                for (auto y = 0; y < st.outHeight; ++y) {
                    const float inY = y * st.heightScale;
                    const float inY1 = (y + 1) * st.heightScale;
@ -961,7 +961,7 @@ namespace helpers {
            if (Status::OK() == res) {
                std::vector<CachedInterpolation> xCached(st.outWidth);
                auto cachingProcedure = PRAGMA_THREADS_FOR {
-                    for (auto x = start; x < stop; x += increment) {
+                    for (auto x = start; x < stop; x++) {
                        auto &xCache = xCached[x];
                        const float inX = x * st.widthScale;
                        const float inX1 = (x + 1) * st.widthScale;
--- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
@ -39,7 +39,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
        'c' == output.ordering() && 1 == output.ews()){

        auto func = PRAGMA_THREADS_FOR{
-             for (auto i = start; i < stop; i += increment) {
+             for (auto i = start; i < stop; i++) {
                 const auto xStep = i*3;
                 z[i] = 0.2989f*x[xStep] + 0.5870f*x[xStep + 1] + 0.1140f*x[xStep + 2];
             }
@ -52,7 +52,7 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
    auto func = PRAGMA_THREADS_FOR{

         Nd4jLong coords[MAX_RANK];
-         for (auto i = start; i < stop; i += increment) {
+         for (auto i = start; i < stop; i++) {
             shape::index2coords(i, output.getShapeInfo(), coords);
             const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
             const auto xOffset0 =  shape::getOffset(input.getShapeInfo(), coords);
@ -99,7 +99,7 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
    const Nd4jLong zDimCstride = output.stridesOf()[dimC];

    auto func = PRAGMA_THREADS_FOR{
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            const T* xTad = x + packX.platformOffsets()[i];
            T* zTad = z + packZ.platformOffsets()[i];
            op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@ -157,7 +157,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
        const Nd4jLong zDimCstride = output->stridesOf()[dimC];

        auto func = PRAGMA_THREADS_FOR{
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                const T* xTad = x + packX.platformOffsets()[i];
                T* zTad = z + packZ.platformOffsets()[i];
                op(xTad[0], xTad[xDimCstride], xTad[2 * xDimCstride], zTad[0], zTad[zDimCstride], zTad[2 * zDimCstride]);
@ -207,7 +207,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
        const Nd4jLong zDimCstride = output->stridesOf()[dimC];

        auto func = PRAGMA_THREADS_FOR{
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                const T* xTad = x + packX.platformOffsets()[i];
                T* zTad = z + packZ.platformOffsets()[i];
                //simple M*v //tr.T*v
--- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp
@ -146,7 +146,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector<int>
        int span = (tads / num_threads) + 8;

        auto func = PRAGMA_THREADS_FOR {
-            for (auto r = start; r < stop; r += increment) {
+            for (auto r = start; r < stop; r++) {
                    auto rX = const_cast<NDArray*>(input)->bufferAsT<X>() + tadOffsets[r];
                    auto rZ = output->bufferAsT<Z>() + zOfsets[r];

--- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp
@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
    if(inTadEws == 1 && outTadEws == 1) {
        
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i += increment) {
+            for (uint i = start; i < stop; i++) {
                const T *x = inBuff + inTadOffsets[i];
                T *y = outBuff + outTadOffsets[i];

@ -179,7 +179,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
    if(inTadEws == 1 && gradITadEws == 1) {
        
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i += increment) {
+            for (uint i = start; i < stop; i++) {
                const X *x = inBuff + inTadOffsets[i];
                      Y *y = gradIBuff + gradITadOffsets[i];

@ -247,7 +247,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
    else {

        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i += increment) {
+            for (uint i = start; i < stop; i++) {
                const X *x = inBuff + inTadOffsets[i];
                      Y *y = gradIBuff + gradITadOffsets[i];

--- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp
@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast,
    auto h_ = h->bufferAsT<T>();

    auto func = PRAGMA_THREADS_FOR {
-        for (uint e = start; e < stop; e += increment) {
+        for (uint e = start; e < stop; e++) {
            c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]);
            h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]);
        }
--- a/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/lup.cpp
@ -45,7 +45,7 @@ namespace helpers {
            auto n = shape::sizeAt(matrixShape, -1);

            auto loop = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    Nd4jLong theFirstPos[] = {theFirst, i};
                    Nd4jLong theSecondPos[] = {theSecond, i};
                    auto theFirstIndex = shape::getOffset(matrixShape, theFirstPos, 0);
@ -203,7 +203,7 @@ namespace helpers {
        auto result = -1;
        //auto loop = PRAGMA_THREADS_FOR {
            auto start = column, stop = rowNum, increment = 1;
-            for (auto rowCounter = start; rowCounter < stop; rowCounter += increment) {
+            for (auto rowCounter = start; rowCounter < stop; rowCounter++) {
                Nd4jLong xPos[] = {rowCounter, column};
                auto xIndex = shape::getOffset(compoundShape, xPos, 0);
                if (nd4j::math::nd4j_abs(compoundBuffer[xIndex]) > maxValue) {
@ -221,7 +221,7 @@ namespace helpers {
        Nd4jLong xDiag[] = {currentRow, currentRow};
        auto diagIndex = shape::getOffset(compoundShape, xDiag, 0);
        auto loop = PRAGMA_THREADS_FOR {
-            for (int j = start; j < stop; j += increment) {
+            for (auto j = start; j < stop; j++) {
                Nd4jLong xRow[] = {j, currentRow};
                auto rowIndex = shape::getOffset(compoundShape, xRow, 0);
                compoundBuf[rowIndex] /= compoundBuf[diagIndex]; //output->t<T>(i, i);
@ -310,7 +310,7 @@ namespace helpers {
            permutations = permutationVectors->allTensorsAlongDimension({-1});

        auto loop = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                luNN_<T, I>(context, outputs.at(i), permutationVectors?permutations.at(i):nullptr, n);
            }
        };
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_diag_part.cpp
@ -46,7 +46,7 @@ int _matrixDiagPart(const NDArray* input, NDArray* output) {
    int lO = listOut.size();

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            for (int j = 0; j < lastDimension; ++j)
                listOut.at(i)->p(j, listDiag.at(i)->e<T>(j, j));
    };
--- a/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/nth_element.cpp
@ -55,7 +55,7 @@ namespace helpers {
            Nd4jLong oL = output->lengthOf();

            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto row = rows.at(e);
                    output->p(e, row->e<T>(n));
                }
--- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp
@ -49,7 +49,7 @@ namespace nd4j {

                if (tadEws >= 1) {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = 0; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) { // begin at `start` (not 0), as the non-EWS branch of this function does: handle only this invocation's [start, stop) TADs
                            auto cO = output + tadPack.primaryOffsets()[e];

                            auto idx = static_cast<int>(indices[e]);
@ -70,7 +70,7 @@ namespace nd4j {
                    samediff::Threads::parallel_tad(func, 0, numTads);
                } else {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            auto cO = output + tadPack.primaryOffsets()[e];

                            auto idx = static_cast<int>(indices[e]);
--- a/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/polyGamma.cpp
@ -70,7 +70,7 @@ template <typename T>
 static void polyGamma_(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& output) {

 	auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
        	const T order = n.e<T>(i);
        	if(order != static_cast<int>(order))						// if order has fractional part then do not perform calculations and return NAN
        		output.p(i, std::numeric_limits<T>::quiet_NaN());
--- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp
@ -113,7 +113,7 @@ namespace helpers {
        ResultSet listOutR(outputR->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}));
        ResultSet listInput(input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}));
        auto batching = PRAGMA_THREADS_FOR {
-            for (auto batch = start; batch < stop; batch += increment) {
+            for (auto batch = start; batch < stop; batch++) {
                //qr here
                qrSingle<T>(listInput.at(batch), listOutQ.at(batch), listOutR.at(batch), fullMatricies);
            }
--- a/libnd4j/include/ops/declarable/helpers/cpu/range.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/range.cpp
@ -39,7 +39,7 @@ static void _range(const NDArray& start, const NDArray& delta, NDArray& outVecto
    auto d = delta.e<T>(0);

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            buff[i] = s + i * d;
    };
    samediff::Threads::parallel_for(func, 0, len);
--- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp
@ -54,7 +54,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
            if (inArr == outArr) {
                if (inEWS == 1) {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            auto idx = sLength - e;
                            swap(inArr, e, idx);
                        }
@ -63,7 +63,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
                }
                else if (inEWS > 1) {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            auto idx1 = (sLength - e) * inEWS;
                            Nd4jLong idx2 = e * inEWS;
                            swap(inArr, idx1, idx2);
@ -75,7 +75,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
                else {

                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
                            auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer);
                            swap(outArr, inOffset, outOffset);
@ -93,14 +93,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
                if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) {

                    auto func = PRAGMA_THREADS_FOR {
-                        for (Nd4jLong e = start; e < stop; e += increment)
+                        for (Nd4jLong e = start; e < stop; e++)
                            outArr[sLength - e] = inArr[e];
                    };
                    samediff::Threads::parallel_for(func, 0, numOfElemsToReverse);

                    if(inLength != numOfElemsToReverse) {
                        auto f2 = PRAGMA_THREADS_FOR {
-                            for (auto e = start; e < stop; e += increment)
+                            for (auto e = start; e < stop; e++)
                                outArr[e] = inArr[e];
                        };
                        samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength);
@ -109,14 +109,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
                else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) {

                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment)
+                        for (auto e = start; e < stop; e++)
                            outArr[(sLength - e) * outEWS] = inArr[e * inEWS];
                    };
                    samediff::Threads::parallel_for(func, 0, numOfElemsToReverse);

                    if(inLength != numOfElemsToReverse) {
                        auto f2 = PRAGMA_THREADS_FOR {
-                            for (auto e = start; e < stop; e += increment)
+                            for (auto e = start; e < stop; e++)
                                outArr[e * outEWS] = inArr[e * inEWS];
                        };
                        samediff::Threads::parallel_for(f2, numOfElemsToReverse, inLength);
@ -125,7 +125,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
                else {

                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
                            auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer);
                            outArr[outOffset] = inArr[inOffset];
@ -136,7 +136,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong *
                    if(inLength != numOfElemsToReverse) {

                        auto f2 = PRAGMA_THREADS_FOR {
-                            for (auto e = start; e < stop; e += increment) {
+                            for (auto e = start; e < stop; e++) {
                                auto inOffset = shape::getIndexOffset(e, inShapeBuffer);
                                auto outOffset = shape::getIndexOffset(e, outShapeBuffer);
                                outArr[outOffset] = inArr[inOffset];
--- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
@ -114,7 +114,7 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
    // loop through input array
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {

            shape::index2coords(i, output.getShapeInfo(), coords);

@ -300,7 +300,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
    // loop through output array
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), coords);

            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
--- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_d.cpp
@ -48,7 +48,7 @@ namespace helpers {
            const int total_count = batch_size * input_height * input_width * input_depth;

            auto func = PRAGMA_THREADS_FOR {
-                for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) {
+                for (auto inp_idx = start; inp_idx < stop; inp_idx++) {
                    // inp_idx = d + input_depth * (w + input_width * (h + input_height * b))
                    const int d = inp_idx % input_depth;
                    const int inp_idx2 = inp_idx / input_depth;
@ -74,7 +74,7 @@ namespace helpers {
            const int total_count = batch_size * output_depth_by_output_area;

            auto func = PRAGMA_THREADS_FOR {
-                for (auto inp_idx = start; inp_idx < stop; inp_idx += increment) {
+                for (auto inp_idx = start; inp_idx < stop; inp_idx++) {
                    const int n_iC_oY_bY_oX = inp_idx / block_size;
                    const int bX = inp_idx - n_iC_oY_bY_oX * block_size;

--- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
@ -45,7 +45,7 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int

        Nd4jLong xCoords[MAX_RANK];

-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {

            shape::index2coords(i, xShapeInfo, xCoords);

@ -79,7 +79,7 @@ void scatter(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& ind

    if(outRank == 1) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                Nd4jLong idx = indices.e<Nd4jLong>(i);
                NDArray out = output({idx, idx + 1});

@ -99,7 +99,7 @@ void scatter(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& ind
        std::iota(dimsToExcludeUpd.begin(), dimsToExcludeUpd.end(), 0);

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                NDArray outSubArr = output(indices.e<Nd4jLong>(i), std::vector<int>({0}));
                NDArray updSubArr = updates(i, dimsToExcludeUpd);

@ -121,7 +121,7 @@ void scatterND(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& i

    if(outRank == 1) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                Nd4jLong idx = indices.e<Nd4jLong>(i);
                NDArray out = output({idx, idx + 1});

@ -139,7 +139,7 @@ void scatterND(nd4j::LaunchContext  *context, pairwise::Ops op, const NDArray& i
        auto func = PRAGMA_THREADS_FOR {
            std::vector<Nd4jLong> idxRangeOut(2*outRank, 0);

-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                NDArray indSubArr = indices(i, dimsToExcludeInd);

                for (Nd4jLong j = 0; j < indLastDim; ++j) {
@ -170,7 +170,7 @@ void scatterForLoss(nd4j::LaunchContext  *context, const NDArray& indices, NDArr

    if(!calcGrad) {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto subArr = updates(i, dimsToExclude);
                output.p(i, subArr.e(indices.e<Nd4jLong>(i)));
            }
@ -179,7 +179,7 @@ void scatterForLoss(nd4j::LaunchContext  *context, const NDArray& indices, NDArr
        samediff::Threads::parallel_for(func, 0, indicesLen);
    } else {
        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto subArr = updates(i, dimsToExclude);
                auto ind = indices.e<Nd4jLong>(i);
                subArr.p(ind, subArr.e(ind) - 1.);
--- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp
@ -169,7 +169,7 @@ namespace helpers {
            for (int i = 1; i < indices->lengthOf(); i++) {
                if (indices->e<int>(i) == idx) {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            meanV.p<T>(e, meanV.e<T>(e) + listOfTensors.at(i)->e<T>(e));
                        }
                    };
@ -223,7 +223,7 @@ namespace helpers {
            for (int i = 0; i < indices->lengthOf(); i++) {
                if (indices->e<int>(i) == idx) {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            sumT->p(e, sumT->e<T>(e) + listOfTensors.at(i)->e<T>(e));
                        }
                    };
@ -272,7 +272,7 @@ namespace helpers {
            for (int i = 1; i < indices->lengthOf(); i++) {
                if (indices->e<int>(i)  == idx) {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto e = start; e < stop; e += increment) {
+                        for (auto e = start; e < stop; e++) {
                            sumT->p(e, sumT->e<T>(e) * listOfTensors.at(i)->e<T>(e));
                        }
                    };
@ -625,7 +625,7 @@ namespace helpers {
            Nd4jLong loop_size = input->lengthOf();

            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto classNum = indices->e<Nd4jLong>(e);
                    if (nd4j::math::nd4j_abs(tempRes.e<T>(classNum) - input->e<T>(e)) <= T(1.e-6))
                        output->p(e, gradOut->e<T>(classNum));
@ -645,7 +645,7 @@ namespace helpers {
            //std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto current = listOfTensors.at(i);
                    auto currentOut = listOfOutTensors.at(i);
@ -675,7 +675,7 @@ namespace helpers {
        segmentMinFunctor(context, input, indices, &tempRes);
        if (input->isVector()) {
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto classNum = indices->e<Nd4jLong>(e);
                    if (nd4j::math::nd4j_abs(tempRes.e<double>(classNum) - input->e<double>(e)) < 1.e-5)
                        output->p(e, gradOut->e<double>(classNum));
@ -697,7 +697,7 @@ namespace helpers {
            int pos = 0;

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto current = listOfTensors.at(i);
                    auto currentOut = listOfOutTensors.at(i);
@ -887,7 +887,7 @@ namespace helpers {
        if (input->isVector()) {

            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto classNum = indices->e<Nd4jLong>(e);
                    if (nd4j::math::nd4j_abs(tempRes.t<T>(classNum) - input->t<T>(e)) < 1.e-6)
                        output->t<T>(e) = gradOut->t<T>(classNum);
@ -1004,7 +1004,7 @@ namespace helpers {
        unsortedSegmentProdFunctor(context, input, indices, numOfClasses, &tempRes);
        if (input->isVector()) {
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto classNum = indices->e<Nd4jLong>(e);
                    output->p<double>(e, gradOut->e<double>(classNum) * tempRes.e<double>(classNum) / input->e<double>(e));
                }
--- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp
@ -364,7 +364,7 @@ namespace nd4j {
                    auto func = PRAGMA_THREADS_FOR {
                        T sneu1e[600];

-                        for (auto t = start; t < stop; t += increment) {
+                        for (auto t = start; t < stop; t++) {
                            T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
                            memset(neu1e, 0, vectorLength * sizeof(T));

@ -457,7 +457,7 @@ namespace nd4j {
                    T sneu1[600];
                    T sneu1e[600];

-                    for (int e = start; e < stop; e += increment) {
+                    for (int e = start; e < stop; e++) {
                        T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength];
                        T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];

--- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp
@ -40,7 +40,7 @@ namespace helpers {
        output->assign(input);

        auto batchLoop = PRAGMA_THREADS_FOR {
-            for (auto batch = start; batch < stop; batch += increment) {
+            for (auto batch = start; batch < stop; batch++) {
                for (auto r = 0; r < rows; r++) {
                    for (auto c = 0; c < r; c++) {
                        math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r));
--- a/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/sru.cpp
@ -143,7 +143,7 @@ static void sruBI_(NDArray* x, const NDArray* w, const NDArray* b, const NDArray
    T* pCt   = ct->bufferAsT<T>();

    auto func = PRAGMA_THREADS_FOR {
-        for (auto col = start; col < stop; col += increment) {
+        for (auto col = start; col < stop; col++) {
            const auto colNum = col % d2;
            bool flip = colNum >= K;
            T maskVal = mask ? *(pMask + col) : T(1);
@ -236,7 +236,7 @@ static void sruBIBP_(NDArray* x, const NDArray* w, const NDArray* b, const NDArr
    T* pGradInit  = gradC0->bufferAsT<T>();

    auto func = PRAGMA_THREADS_FOR {
-        for (auto col = start; col < stop; col += increment) {
+        for (auto col = start; col < stop; col++) {
            T gbF = 0.f;
            T gbR = 0.f;
            const auto colNum = col % d2;
--- a/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/stack.cpp
@ -37,7 +37,7 @@ static void stack_(const std::vector<const NDArray*>& inArrs, NDArray* outArr, c
 	    int inSize = inArrs.size();

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment)
+            for (auto i = start; i < stop; i++)
                outArr->p<T>(i, inArrs[i]->t<T>(0));
        };

@ -50,7 +50,7 @@ static void stack_(const std::vector<const NDArray*>& inArrs, NDArray* outArr, c
        int listSize = list.size();

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment)
+            for (auto i = start; i < stop; i++)
                list.at(i)->assign(inArrs[i]);
        };
        samediff::Threads::parallel_tad(func, 0, listSize);
--- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp
@ -150,7 +150,7 @@ namespace helpers {
            result->assign(0);
            if (status == ND4J_STATUS_OK) {
                auto func = PRAGMA_THREADS_FOR {
-                    for (auto e = start; e < stop; e += increment) {
+                    for (auto e = start; e < stop; e++) {
                        bool found = false;
                        for (int j = 0; j < k; j++) {
                            if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) {
--- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
@ -43,7 +43,7 @@ static void triuBP_(nd4j::LaunchContext * context, const NDArray& input, const N
    int dLen = dOdI.lengthOf();

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            if (dOdI.t<T>(i) != static_cast<T>(0.f))
                dOdI.t<T>(i) = static_cast<T>(1.f);
        }
@ -65,7 +65,7 @@ static void trace_(const NDArray& input, NDArray& output) {
    auto setOfSubArrs = input.allTensorsAlongDimension({inRank-2, inRank-1});

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            output.p(i, setOfSubArrs.at(i)->getTrace());
    };
    samediff::Threads::parallel_for(func, 0, setOfSubArrs.size());
@ -189,7 +189,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray

        auto func = PRAGMA_THREADS_FOR {
            Nd4jLong coords[MAX_RANK];
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                shape::index2coords(i, output.getShapeInfo(), coords);
                const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);

@ -220,7 +220,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray

        auto func = PRAGMA_THREADS_FOR {
            Nd4jLong coords[MAX_RANK];
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                shape::index2coords(i, output.getShapeInfo(), coords);
                const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);

@ -566,7 +566,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {

    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK * 3];
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            Nd4jLong *zCoordStart, *xCoordStart;

            if (yLastDim == xRank) {
@ -650,7 +650,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
        else if (input->rankOf() == 1 && indices->isVector()) {
            // special case
            auto func = PRAGMA_THREADS_FOR {
-                for (auto e = start; e < stop; e += increment)
+                for (auto e = start; e < stop; e++)
                    output->p(e, input->e<T>(indices->e<Nd4jLong>(e)));
            };

@ -663,7 +663,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
            const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), dimsOut);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    NDArray subArrOut = (*output)(i, dimsOut);
                    NDArray subArrIn = (*input)(indices->e<Nd4jLong>(i), {axis});
                    subArrOut.assign(subArrIn);
@ -687,7 +687,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
            const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(output->getShapeInfo(), {axis});

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    NDArray subArrOut = (*output)(i, {axis});
                    NDArray subArrIn = (*input)(intArgs[i + 1], {axis});
                    subArrOut.assign(subArrIn);
@ -710,7 +710,7 @@ void eye(nd4j::LaunchContext * context, NDArray& output) {
    auto arrs = output.allTensorsAlongDimension({rank-2, rank-1});

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            arrs.at(i)->setIdentity();
    };

@ -737,7 +737,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat
        indices.push_back((*intArgs)[e]);

    auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment) {
+        for (auto i = start; i < stop; i++) {
            auto inSubArr = input(indices[i], dimsToExclude, true);
            auto updSubArr = updates(i, dimsToExclude, true);

@ -786,7 +786,7 @@ void scatterSimple(nd4j::LaunchContext * context, const int opId, NDArray& input

        case 6: {   // copy
            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto inSubArr = input(i, dimensions);
                    inSubArr.p(indices.t<Nd4jLong>(i), updates.e(i));
                }
@ -809,7 +809,7 @@ static void mergeMaxIndex_(const std::vector<NDArray*>& inArrs, NDArray& output)
    auto x = inArrs[0];

    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            T max = -DataTypeUtils::max<T>();
            Nd4jLong idx = 0;

@ -839,7 +839,7 @@ static void mergeMax_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto x = inArrs[0];

    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            T max = -DataTypeUtils::max<T>();
            for (int i = 0; i < numArgs; i++) {
                T v = inArrs[i]->e<T>(e);
@ -865,7 +865,7 @@ static void mergeAvg_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto x = inArrs[0];

    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            T sum = 0.;
            for (int i = 0; i < numArgs; i++) {
                T v = inArrs[i]->e<T>(e);
@ -891,7 +891,7 @@ static void mergeAdd_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto x = inArrs[0];

    auto func = PRAGMA_THREADS_FOR {
-        for (auto e = start; e < stop; e += increment) {
+        for (auto e = start; e < stop; e++) {
            T sum = (T) 0.f;
            for (int i = 0; i < numArgs; i++)
                sum += inArrs[i]->e<T>(e);
@ -928,7 +928,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>&
            auto listOfInSubArrs = input.allTensorsAlongDimension(dimensions);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    const T iNormActual = norm2.e<T>(i);
                    if (iNormActual > normClip)
                        *listOfInSubArrs.at(i) *= normClip / iNormActual;
@ -952,7 +952,7 @@ static void clipByNorm_(NDArray& input, NDArray& output, const std::vector<int>&
            auto listOfOutSubArrs = output.allTensorsAlongDimension(dimensions);

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    auto inputSubArr = listOfInSubArrs.at(i);
                    auto outputSubArr = listOfOutSubArrs.at(i);
                    outputSubArr->assign(inputSubArr);
@ -1058,7 +1058,7 @@ static void clipByNormBP_(const NDArray& input, const NDArray& gradO, NDArray& g
        auto cn = clipNorm.e<T>(0);

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                T N = norm2.e<T>(i);

                auto gradOSubArr = gradOSubArrs.at(i);
@ -1190,7 +1190,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
        auto func = PRAGMA_THREADS_FOR {
            Nd4jLong inIdx[MAX_RANK];
            Nd4jLong outIdx[MAX_RANK];
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                shape::index2coords(i, output.getShapeInfo(), outIdx);

                for (int j = 0; j < rank; ++j) {
@ -1225,17 +1225,6 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o

    BUILD_SINGLE_TEMPLATE(template void mirrorPad_, (const NDArray& input, const NDArray& paddings, NDArray& output, const int mode), LIBND4J_TYPES);

-//////////////////////////////////////////////////////////////////////////
-template<typename T>
-static void concat_(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
-    nd4j::SpecialMethods<T>::concatCpuGeneric(inArrs, output, axis);
-}
-
-    void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
-        BUILD_SINGLE_SELECTOR(output.dataType(), concat_,(inArrs, output, axis), LIBND4J_TYPES);
-    }
-
-    BUILD_SINGLE_TEMPLATE(template void concat_, (const std::vector<NDArray*>& inArrs, NDArray& output, const int axis), LIBND4J_TYPES);

 //////////////////////////////////////////////////////////////////////////
 template <typename T>
--- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp
@ -90,7 +90,7 @@ namespace helpers {
        auto outputPart = output->allTensorsAlongDimension({-2, -1});

        auto batchLoop = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                if (lower) {
                    lowerTriangularSolve<T>(context, leftPart[i], rightPart[i], adjoint, outputPart[i]);
                } else {
@ -112,7 +112,7 @@ namespace helpers {
        auto rows = input->sizeAt(-2);

        auto batchLoop = PRAGMA_THREADS_FOR {
-            for (auto batch = start; batch < stop; batch += increment) {
+            for (auto batch = start; batch < stop; batch++) {
                if (!lower) {
                    for (auto r = 0; r < rows; r++) {
                        for (auto c = 0; c <= r; c++) {
--- a/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/zeta.cpp
@ -64,7 +64,7 @@ static void zeta_(nd4j::LaunchContext * context, const NDArray& x, const NDArray
 	int xLen = x.lengthOf();

 	auto func = PRAGMA_THREADS_FOR {
-        for (auto i = start; i < stop; i += increment)
+        for (auto i = start; i < stop; i++)
            z.p(i, zetaScalar<T>(x.e<T>(i), q.e<T>(i)));
    };

--- a/libnd4j/include/ops/declarable/helpers/cross.h
+++ b/libnd4j/include/ops/declarable/helpers/cross.h
@ -68,7 +68,7 @@ void FORCEINLINE cross(nd4j::LaunchContext * context, NDArray *a, NDArray *b, ND
        int tads = tadsA.size();

        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                auto a_ = tadsA.at(e);
                auto b_ = tadsB.at(e);
                auto o_ = tadsO.at(e);
--- a/libnd4j/include/ops/declarable/helpers/impl/unique.cpp
+++ b/libnd4j/include/ops/declarable/helpers/impl/unique.cpp
@ -69,7 +69,7 @@ namespace helpers {
        }

        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                values->p(e, static_cast<T>(valuesVector[e]));
                if (counts != nullptr)
                    counts->p(e, countsMap[valuesVector[e]]);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_0.cpp
@ -19,8 +19,10 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_0);
+
+    BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES);
 }
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_1.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_1);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_2.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_2);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_3.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_3);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_4.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_4);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_5.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_5);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_6.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_6);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_7.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_7);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_8.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_8);
--- a/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_double_9.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_double.hpp"

 namespace nd4j {
    BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES_9);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_0.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_0);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_1.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_1);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_2.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_2);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_3.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_3);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_4.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_4);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_5.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_5);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_6.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_6);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_7.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_7);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_8.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_8);
--- a/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp
+++ b/libnd4j/include/ops/impl/compilation_units/specials_single_9.cpp
@ -19,7 +19,7 @@
 // @author raver119@gmail.com
 //

-#include "../specials.hpp"
+#include "../specials_single.hpp"

 namespace nd4j {
    BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES_9);
--- a/libnd4j/include/ops/impl/gemm.cpp
+++ b/libnd4j/include/ops/impl/gemm.cpp
@ -34,7 +34,7 @@ namespace nd4j {

            // handle transpose in parallel
            auto func = PRAGMA_THREADS_FOR {
-                for (auto r = start; r < stop; r += increment) {
+                for (auto r = start; r < stop; r++) {
                    for (int c = 0; c < cols; c++) {
                        int zIdx = orderTarget == CblasRowMajor ? linearIndexC(rows, cols, r, c) : linearIndexF(rows, cols, r, c);
                        int xIdx = orderSource == CblasColMajor ? linearIndexF(rows, cols, r, c) : linearIndexC(rows, cols, r, c);
@ -73,7 +73,7 @@ namespace nd4j {
                        C[r] = z;
                } else {
                    auto func = PRAGMA_THREADS_FOR {
-                        for (auto r = start; r < stop; r += increment)
+                        for (auto r = start; r < stop; r++)
                            C[r] = z;
                    };
                    samediff::Threads::parallel_for(func, 0, length);
@ -130,7 +130,7 @@ namespace nd4j {
            auto aT = TRANS == CblasTrans ? reinterpret_cast<X *>(nd4j::blas::transpose<X>(CblasColMajor, CblasRowMajor, M, N, reinterpret_cast<void *>(x))) : x;

            auto func = PRAGMA_THREADS_FOR {
-                for (auto r = start; r < stop; r += increment) {
+                for (auto r = start; r < stop; r++) {
                    int aIdx = linearIndexC(M, N, r, 0);
                    auto aX = aT + aIdx;

--- a/libnd4j/include/ops/impl/specials_double.hpp
+++ b/libnd4j/include/ops/impl/specials_double.hpp
@ -0,0 +1,270 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author raver119@gmail.com, created on 07.10.2017.
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+
+#include <pointercast.h>
+#include <helpers/shape.h>
+#include <helpers/TAD.h>
+#include <specials.h>
+#include <dll.h>
+#include <NDArray.h>
+#include <ops/declarable/CustomOperations.h>
+#include <types/types.h>
+#include <helpers/Loops.h>
+
+namespace nd4j {
+
+    /*
+     * Element-wise type conversion: reads N values of type S from dx and stores
+     * each one, static_cast to T, at the same index of dz.
+     *
+     * extras is not referenced anywhere in this body.
+     * The copy loop is packaged as a callable and handed to
+     * samediff::Threads::parallel_for over the index range [0, N).
+     */
+    template<typename S, typename T>
+    void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) {
+        auto x = reinterpret_cast<S *>(dx);  // source buffer viewed as S
+        auto z = reinterpret_cast<T *>(dz);  // destination buffer viewed as T
+
+
+        auto func = PRAGMA_THREADS_FOR {
+            for (auto i = start; i < stop; i++) {  // start/stop are presumably supplied by PRAGMA_THREADS_FOR (macro defined elsewhere)
+                z[i] = static_cast<T>(x[i]);
+            }
+        };
+
+        samediff::Threads::parallel_for(func, 0, N);
+    };
+
+    /*
+     * One recursive quicksort step over the logical index range [left, right],
+     * ordering by key.
+     *
+     * The pivot is the key at the middle index. Every swap of two keys is
+     * mirrored by a swap of the values at the same two logical indices, so each
+     * key keeps its value. Logical indices are translated to buffer offsets with
+     * shape::getIndexOffset, using xShapeInfo for keys and yShapeInfo for values.
+     *
+     * descending selects the comparison direction. Ranges narrower than cutoff
+     * recurse inline; wider ranges wrap each recursive call in PRAGMA_OMP_TASK.
+     *
+     * NOTE(review): left/right are int, while the caller in this file passes
+     * lenArray-1 computed from an Nd4jLong - confirm lengths always fit in int.
+     */
+    template <typename X, typename Y>
+    void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
+        int i = left, j = right;
+        X ktmp;
+        X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)];  // middle element of the range
+
+        Y vtmp;
+
+        {
+            /* PARTITION PART */
+            while (i <= j) {
+                if (descending) {
+                    while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot)
+                        i++;
+                    while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot)
+                        j--;
+                    if (i <= j) {
+                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
+                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
+                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
+
+                        vtmp = values[shape::getIndexOffset(i, yShapeInfo)];  // mirror the key swap on the values
+                        values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
+                        values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
+
+                        i++;
+                        j--;
+                    }
+                } else {
+                    while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot)
+                        i++;
+                    while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot)
+                        j--;
+                    if (i <= j) {
+                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
+                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
+                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
+
+                        vtmp = values[shape::getIndexOffset(i, yShapeInfo)];  // mirror the key swap on the values
+                        values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
+                        values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
+
+                        i++;
+                        j--;
+                    }
+                }
+            }
+
+        }
+
+        // Recurse into the two sub-ranges [left, j] and [i, right] left by the partition.
+
+        if ( ((right-left)<cutoff) ){
+            if (left < j){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
+            if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
+
+        }else{  // NOTE(review): unlike the branch above, these calls are not guarded by left < j / i < right
+PRAGMA_OMP_TASK
+            { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
+PRAGMA_OMP_TASK
+            { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
+        }
+    }
+
+    /*
+     * One recursive quicksort step over the logical index range [left, right],
+     * ordering by value (the counterpart of quickSort_parallel_internal_key).
+     *
+     * The pivot is the value at the middle index and all comparisons are made
+     * on value; every swap is applied to both key and value at the same two
+     * logical indices, so pairs stay together. Logical indices are translated
+     * to buffer offsets with shape::getIndexOffset, using xShapeInfo for keys
+     * and yShapeInfo for values.
+     *
+     * descending selects the comparison direction. Ranges narrower than cutoff
+     * recurse inline; wider ranges wrap each recursive call in PRAGMA_OMP_TASK.
+     *
+     * NOTE(review): left/right are int, while the caller in this file passes
+     * lenArray-1 computed from an Nd4jLong - confirm lengths always fit in int.
+     */
+    template <typename X, typename Y>
+    void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
+        int i = left, j = right;
+        X ktmp;
+        Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)];  // middle element of the range
+
+        Y vtmp;
+
+        {
+            /* PARTITION PART */
+            while (i <= j) {
+                if (descending) {
+                    while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot)
+                        i++;
+                    while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot)
+                        j--;
+                    if (i <= j) {
+                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];  // keys follow the value ordering
+                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
+                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
+
+                        vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
+                        value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
+                        value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
+
+                        i++;
+                        j--;
+                    }
+                } else {
+                    while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot)
+                        i++;
+                    while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot)
+                        j--;
+                    if (i <= j) {
+                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];  // keys follow the value ordering
+                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
+                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
+
+                        vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
+                        value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
+                        value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
+
+                        i++;
+                        j--;
+                    }
+                }
+            }
+
+        }
+
+        // Recurse into the two sub-ranges [left, j] and [i, right] left by the partition.
+
+        if ( ((right-left)<cutoff) ){
+            if (left < j){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
+            if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
+
+        }else{  // NOTE(review): unlike the branch above, these calls are not guarded by left < j / i < right
+PRAGMA_OMP_TASK
+            { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
+PRAGMA_OMP_TASK
+            { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
+        }
+    }
+
+    /*
+     * Entry point for the key-ordered pair sort. Casts the untyped buffers to
+     * X* (keys) and Y* (values) and starts quickSort_parallel_internal_key on
+     * the range [0, lenArray-1] with a fixed cutoff of 1000.
+     *
+     * The call is wrapped in PRAGMA_OMP_PARALLEL_THREADS(numThreads) and
+     * PRAGMA_OMP_SINGLE_ARGS(nowait); both macros are defined outside this file.
+     *
+     * NOTE(review): with lenArray == 0 the initial range is [0, -1] and the
+     * callee still reads index 0 as its pivot - confirm callers never pass an
+     * empty array.
+     */
+    template <typename X, typename Y>
+    static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
+        auto array = reinterpret_cast<X *>(varray);
+        auto values = reinterpret_cast<Y *>(yarray);
+        int cutoff = 1000;  // ranges narrower than this are sorted inline instead of via PRAGMA_OMP_TASK
+
+        PRAGMA_OMP_PARALLEL_THREADS(numThreads)
+        {
+PRAGMA_OMP_SINGLE_ARGS(nowait)
+            {
+                quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
+            }
+        }
+    }
+    /*
+     * Entry point for the value-ordered pair sort. Casts the untyped buffers to
+     * X* (keys) and Y* (values) and starts quickSort_parallel_internal_value on
+     * the range [0, lenArray-1] with a fixed cutoff of 1000.
+     *
+     * The call is wrapped in PRAGMA_OMP_PARALLEL_THREADS(numThreads) and
+     * PRAGMA_OMP_SINGLE_ARGS(nowait); both macros are defined outside this file.
+     *
+     * NOTE(review): with lenArray == 0 the initial range is [0, -1] and the
+     * callee still reads index 0 as its pivot - confirm callers never pass an
+     * empty array.
+     */
+    template <typename X, typename Y>
+    static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
+        auto array = reinterpret_cast<X *>(varray);
+        auto values = reinterpret_cast<Y *>(yarray);
+        int cutoff = 1000;  // ranges narrower than this are sorted inline instead of via PRAGMA_OMP_TASK
+
+        PRAGMA_OMP_PARALLEL_THREADS(numThreads)
+        {
+PRAGMA_OMP_SINGLE_ARGS(nowait)
+            {
+                quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
+            }
+        }
+    }
+    /*
+     * Sorts the key buffer vx and the value buffer vy together, ordered by key.
+     * Delegates to quickSort_parallel_key with the element count taken from
+     * shape::length(xShapeInfo) and the thread count from omp_get_max_threads().
+     */
+    template <typename X, typename Y>
+    void DoubleMethods<X,Y>::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
+        quickSort_parallel_key<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
+    }
+    /*
+     * Sorts the key buffer vx and the value buffer vy together, ordered by value.
+     * Delegates to quickSort_parallel_value with the element count taken from
+     * shape::length(xShapeInfo) (the key shape, not yShapeInfo) and the thread
+     * count from omp_get_max_threads().
+     */
+    template <typename X, typename Y>
+    void DoubleMethods<X,Y>::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
+        quickSort_parallel_value<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
+    }
+    /*
+     * Sorts key/value pairs by key independently inside each sub-array selected
+     * by dimension / dimensionLength.
+     *
+     * ConstantTadHelper::tadForDimensions yields one pack per input; each pack
+     * provides a shared sub-array shape (primaryShapeInfo) and one buffer offset
+     * per sub-array (primaryOffsets). For every index r in [0, numTads) the
+     * sub-arrays of x and y at offset r are sorted with quickSort_parallel_key
+     * using a thread count of 1; the loop over r is handed to
+     * samediff::Threads::parallel_tad.
+     */
+    template <typename X, typename Y>
+    void DoubleMethods<X,Y>::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
+        auto x = reinterpret_cast<X*>(vx);
+        auto y = reinterpret_cast<Y*>(vy);
+
+        auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
+        auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
+
+        auto xLength = shape::length(xShapeInfo);  // NOTE(review): not referenced again in this function
+        auto xTadLength = shape::length(packX.primaryShapeInfo());  // elements per sub-array
+        auto numTads = packX.numberOfTads();
+
+        auto func = PRAGMA_THREADS_FOR {
+            for (auto r = start; r < stop; r++) {
+                auto dx = x + packX.primaryOffsets()[r];  // start of the r-th key sub-array
+                auto dy = y + packY.primaryOffsets()[r];  // start of the r-th value sub-array
+
+                quickSort_parallel_key<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
+            }
+        };
+
+        samediff::Threads::parallel_tad(func, 0, numTads);
+    }
+    /*
+     * Sorts key/value pairs by value independently inside each sub-array
+     * selected by dimension / dimensionLength.
+     *
+     * ConstantTadHelper::tadForDimensions yields one pack per input; each pack
+     * provides a shared sub-array shape (primaryShapeInfo) and one buffer offset
+     * per sub-array (primaryOffsets). For every index r in [0, numTads) the
+     * sub-arrays of x and y at offset r are sorted with quickSort_parallel_value
+     * using a thread count of 1; the loop over r is handed to
+     * samediff::Threads::parallel_tad.
+     */
+    template <typename X, typename Y>
+    void DoubleMethods<X,Y>::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
+        auto x = reinterpret_cast<X*>(vx);
+        auto y = reinterpret_cast<Y*>(vy);
+
+        auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
+        auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
+
+        auto xLength = shape::length(xShapeInfo);  // NOTE(review): not referenced again in this function
+        auto xTadLength = shape::length(packX.primaryShapeInfo());  // elements per sub-array
+        auto numTads = packX.numberOfTads();
+
+        auto func = PRAGMA_THREADS_FOR {
+            for (auto r = start; r < stop; r++) {
+                auto dx = x + packX.primaryOffsets()[r];  // start of the r-th key sub-array
+                auto dy = y + packY.primaryOffsets()[r];  // start of the r-th value sub-array
+
+                quickSort_parallel_value<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
+            }
+        };
+
+        samediff::Threads::parallel_tad(func, 0, numTads);
+    }
+}
+
--- a/libnd4j/include/ops/impl/specials_single.hpp
+++ b/libnd4j/include/ops/impl/specials_single.hpp
@ -64,7 +64,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, ND
                T* outBuff = output.bufferAsT<T>();

                auto func = PRAGMA_THREADS_FOR {
-                    for (auto r = start; r < stop; r += increment) {
+                    for (auto r = start; r < stop; r++) {
                        const Nd4jLong arrLen = inArrs[r]->lengthOf();
                        const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];

@ -99,7 +99,7 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, ND
        }

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                auto temp = output(indices[i], true);
                nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
            }
@ -143,7 +143,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
        auto x = reinterpret_cast<T **>(vx);

        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
+            for (auto i = start; i < stop; i++) {
                for (auto ar = 0L; ar < n; ar++) {
                    z[i] += x[ar][i];
                }
@ -179,7 +179,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
            }

            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    for (Nd4jLong ar = 1; ar < n; ar++) {
                        z[i] += x[ar][i] / static_cast<T>(n);
                    }
@ -199,7 +199,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint

            // aggregation step
            auto func = PRAGMA_THREADS_FOR {
-                for (auto i = start; i < stop; i += increment) {
+                for (auto i = start; i < stop; i++) {
                    for (Nd4jLong ar = 0; ar < n; ar++) {
                        z[i] += x[ar][i] / static_cast<T>(n);
                    }
@ -336,7 +336,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
        int numTads = xLength / xTadLength;

        auto func = PRAGMA_THREADS_FOR {
-            for (auto r = start; r < stop; r += increment) {
+            for (auto r = start; r < stop; r++) {
                T *dx = x + tadOffsets[r];

                quickSort_parallel(dx, tadShapeInfo, xTadLength, 1, descending);
@ -358,7 +358,7 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)


        auto func = PRAGMA_THREADS_FOR {
-            for (auto e = start; e < stop; e += increment) {
+            for (auto e = start; e < stop; e++) {
                for (int bitId = 0; bitId < 16; bitId++) {
                    bool hasBit = (x[e] & 1 << (bitId)) != 0;
                    bool hasSign = (x[e] & 1 << (bitId + 16)) != 0;
@ -378,22 +378,6 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
        samediff::Threads::parallel_for(func, 4, lim);
    }

-    template<typename S, typename T>
-    void SpecialTypeConverter::convertGeneric(Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz) {
-        auto x = reinterpret_cast<S *>(dx);
-        auto z = reinterpret_cast<T *>(dz);
-
-
-        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i += increment) {
-                z[i] = static_cast<T>(x[i]);
-            }
-        };
-
-        samediff::Threads::parallel_for(func, 0, N);
-    };
-    BUILD_DOUBLE_TEMPLATE(template void SpecialTypeConverter::convertGeneric, (Nd4jPointer * extras, void *dx, Nd4jLong N, void *dz), LIBND4J_TYPES, LIBND4J_TYPES);
-
    template<typename T>
    Nd4jLong SpecialMethods<T>::encodeBitmapGeneric(void *vx, Nd4jLong *xShapeInfo, Nd4jLong N, int *dz, float threshold) {
        auto dx = reinterpret_cast<T *>(vx);
@ -442,226 +426,5 @@ PRAGMA_OMP_SINGLE_ARGS(nowait)
        };
        return samediff::Threads::parallel_long(func, LAMBDA_SUML, 0, N, 16);
    }
-
-    template <typename X, typename Y>
-    void quickSort_parallel_internal_key(X* key, Nd4jLong *xShapeInfo, Y* values, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
-        int i = left, j = right;
-        X ktmp;
-        X pivot = key[shape::getIndexOffset((left + right) / 2, xShapeInfo)];
-
-        Y vtmp;
-
-        {
-            /* PARTITION PART */
-            while (i <= j) {
-                if (descending) {
-                    while (key[shape::getIndexOffset(i, xShapeInfo)] > pivot)
-                        i++;
-                    while (key[shape::getIndexOffset(j, xShapeInfo)] < pivot)
-                        j--;
-                    if (i <= j) {
-                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
-                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
-                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
-
-                        vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
-                        values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
-                        values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
-
-                        i++;
-                        j--;
-                    }
-                } else {
-                    while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot)
-                        i++;
-                    while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot)
-                        j--;
-                    if (i <= j) {
-                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
-                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
-                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
-
-                        vtmp = values[shape::getIndexOffset(i, yShapeInfo)];
-                        values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)];
-                        values[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
-
-                        i++;
-                        j--;
-                    }
-                }
-            }
-
-        }
-
-        //
-
-        if ( ((right-left)<cutoff) ){
-            if (left < j){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
-            if (i < right){ quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
-
-        }else{
-PRAGMA_OMP_TASK
-            { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, left, j, cutoff, descending); }
-PRAGMA_OMP_TASK
-            { quickSort_parallel_internal_key(key, xShapeInfo, values, yShapeInfo, i, right, cutoff, descending); }
-        }
-    }
-
-
-    template <typename X, typename Y>
-    void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) {
-        int i = left, j = right;
-        X ktmp;
-        Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)];
-
-        Y vtmp;
-
-        {
-            /* PARTITION PART */
-            while (i <= j) {
-                if (descending) {
-                    while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot)
-                        i++;
-                    while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot)
-                        j--;
-                    if (i <= j) {
-                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
-                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
-                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
-
-                        vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
-                        value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
-                        value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
-
-                        i++;
-                        j--;
-                    }
-                } else {
-                    while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot)
-                        i++;
-                    while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot)
-                        j--;
-                    if (i <= j) {
-                        ktmp = key[shape::getIndexOffset(i, xShapeInfo)];
-                        key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)];
-                        key[shape::getIndexOffset(j, xShapeInfo)] = ktmp;
-
-                        vtmp = value[shape::getIndexOffset(i, yShapeInfo)];
-                        value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)];
-                        value[shape::getIndexOffset(j, yShapeInfo)] = vtmp;
-
-                        i++;
-                        j--;
-                    }
-                }
-            }
-
-        }
-
-        //
-
-        if ( ((right-left)<cutoff) ){
-            if (left < j){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
-            if (i < right){ quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
-
-        }else{
-PRAGMA_OMP_TASK
-            { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, left, j, cutoff, descending); }
-PRAGMA_OMP_TASK
-            { quickSort_parallel_internal_value(key, xShapeInfo, value, yShapeInfo, i, right, cutoff, descending); }
-        }
-    }
-
-
-    template <typename X, typename Y>
-    static void quickSort_parallel_key(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
-        auto array = reinterpret_cast<X *>(varray);
-        auto values = reinterpret_cast<Y *>(yarray);
-        int cutoff = 1000;
-
-        PRAGMA_OMP_PARALLEL_THREADS(numThreads)
-        {
-PRAGMA_OMP_SINGLE_ARGS(nowait)
-            {
-                quickSort_parallel_internal_key(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
-            }
-        }
-    }
-
-    template <typename X, typename Y>
-    static void quickSort_parallel_value(void *varray, Nd4jLong *xShapeInfo, void *yarray, Nd4jLong *yShapeInfo, Nd4jLong lenArray, int numThreads, bool descending){
-        auto array = reinterpret_cast<X *>(varray);
-        auto values = reinterpret_cast<Y *>(yarray);
-        int cutoff = 1000;
-
-        PRAGMA_OMP_PARALLEL_THREADS(numThreads)
-        {
-PRAGMA_OMP_SINGLE_ARGS(nowait)
-            {
-                quickSort_parallel_internal_value(array, xShapeInfo, values, yShapeInfo, 0, lenArray-1, cutoff, descending);
-            }
-        }
-    }
-
-    template <typename X, typename Y>
-    void DoubleMethods<X,Y>::sortByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
-        quickSort_parallel_key<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
-    }
-
-    template <typename X, typename Y>
-    void DoubleMethods<X,Y>::sortByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, bool descending) {
-        quickSort_parallel_value<X,Y>(vx, xShapeInfo, vy, yShapeInfo, shape::length(xShapeInfo), omp_get_max_threads(), descending);
-    }
-
-    template <typename X, typename Y>
-    void DoubleMethods<X,Y>::sortTadByKey(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
-        auto x = reinterpret_cast<X*>(vx);
-        auto y = reinterpret_cast<Y*>(vy);
-
-        auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
-        auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
-
-        auto xLength = shape::length(xShapeInfo);
-        auto xTadLength = shape::length(packX.primaryShapeInfo());
-        auto numTads = packX.numberOfTads();
-
-        auto func = PRAGMA_THREADS_FOR {
-            for (auto r = start; r < stop; r += increment) {
-                auto dx = x + packX.primaryOffsets()[r];
-                auto dy = y + packY.primaryOffsets()[r];
-
-                quickSort_parallel_key<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
-            }
-        };
-
-        samediff::Threads::parallel_tad(func, 0, numTads);
-    }
-
-    template <typename X, typename Y>
-    void DoubleMethods<X,Y>::sortTadByValue(void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, int *dimension, int dimensionLength, bool descending) {
-        auto x = reinterpret_cast<X*>(vx);
-        auto y = reinterpret_cast<Y*>(vy);
-
-        auto packX = ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dimension, dimensionLength);
-        auto packY = ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dimension, dimensionLength);
-
-        auto xLength = shape::length(xShapeInfo);
-        auto xTadLength = shape::length(packX.primaryShapeInfo());
-        auto numTads = packX.numberOfTads();
-
-        auto func = PRAGMA_THREADS_FOR {
-            for (auto r = start; r < stop; r += increment) {
-                auto dx = x + packX.primaryOffsets()[r];
-                auto dy = y + packY.primaryOffsets()[r];
-
-                quickSort_parallel_value<X, Y>(dx, packX.primaryShapeInfo(), dy, packY.primaryShapeInfo(), xTadLength, 1, descending);
-            }
-        };
-
-        samediff::Threads::parallel_tad(func, 0, numTads);
-    }
-
-    //BUILD_SINGLE_TEMPLATE(template class SpecialMethods, , LIBND4J_TYPES);
-    //BUILD_DOUBLE_TEMPLATE(template class DoubleMethods, , LIBND4J_TYPES, LIBND4J_TYPES);
 }

--- a/libnd4j/include/ops/special_random_ops.h
+++ b/libnd4j/include/ops/special_random_ops.h
@ -167,7 +167,7 @@ namespace randomOps {

            if (zEWS >= 1 && xEWS >= 1 && yEWS >= 1) {
                auto func = PRAGMA_THREADS_FOR {
-                    for (uint64_t e = start; e < stop; e += increment) {
+                    for (auto e = start; e < stop; e++) {
                        T prob = rng->relativeT<T>(e);
                        T cumProb = (T) 0.0f;
                        for (Nd4jLong f = 0; f < yLength; f++) {
@ -330,7 +330,7 @@ namespace randomOps {
            const T epsilon = static_cast<T>(1e-5);

            auto func = PRAGMA_THREADS_FOR {
-                for (uint64_t e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto epm = e + middle;

                    // we need to get random values
@ -440,7 +440,7 @@ namespace randomOps {

            nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state);
            auto func = PRAGMA_THREADS_FOR {
-                for (Nd4jLong e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {

                    int success = 0;
                    for (int t = 1; t <= trials; t++) {
@ -549,7 +549,7 @@ namespace randomOps {
            //nd4j::random::RandomBuffer *buffer = reinterpret_cast<nd4j::random::RandomBuffer *> (state);
            nd4j::graph::RandomGenerator* rng = reinterpret_cast<nd4j::graph::RandomGenerator*>(state);
            auto func = PRAGMA_THREADS_FOR {
-                for (uint64_t e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {

                    int success = 0;
                    for (int t = 1; t <= trials; t++) {
@ -690,7 +690,7 @@ namespace randomOps {
            const T epsilon = static_cast<T>(1e-5);

            auto func = PRAGMA_THREADS_FOR {
-                for (uint64_t e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    if (z[e] > mean + ds || z[e] < mean - ds) {
                        z[e] = step(rng, mean, stddev, e, middle, z[e]);

@ -818,7 +818,7 @@ namespace randomOps {

            auto func = PRAGMA_THREADS_FOR {
                PRAGMA_OMP_SIMD
-                for (uint64_t e = start; e < stop; e += increment) {
+                for (auto e = start; e < stop; e++) {
                    auto epm = e + middle;

                    // we need to get random values