From b4575d11e97dd847dfd7cb8b2255ee3386faaa93 Mon Sep 17 00:00:00 2001
From: Oleh
Date: Wed, 26 Feb 2020 20:12:19 +0200
Subject: [PATCH] Loops auto-vectorization problem fix (#274)

* libnd4j cast loop types

Signed-off-by: Oleg

* libnd4j more type casting added to loops

Signed-off-by: Oleg

* libnd4j sync casting types of iterated variables in loops

Signed-off-by: Oleg

* libnd4j more loops reviewed for vectorization problem fix

Signed-off-by: Oleg

* libnd4j fixed several typos

Signed-off-by: Oleg

* libnd4j several more files reviewed to fix auto-vectorization problem in loops

Signed-off-by: Oleg

* libnd4j merged master and reviewed more files to fix auto-vectorization problem in loops

Signed-off-by: Oleg

* libnd4j several type casts added in broadcasting that were missed, fixed Mac builds

Signed-off-by: Oleg

* libnd4j double-checked all files and fixed several more places in loops

Signed-off-by: Oleg

* libnd4j fixed builds

Signed-off-by: Oleg

* libnd4j reverted changes for lup.cpp

Signed-off-by: Oleg
---
 libnd4j/include/helpers/Loops.h               | 1687 +++++++++--------
 .../include/loops/cpu/TrueBroadcastHelper.hpp |   16 +-
 libnd4j/include/loops/cpu/broadcasting.hpp    |   48 +-
 .../include/loops/cpu/broadcasting_bool.cpp   |   20 +-
 .../include/loops/cpu/broadcasting_int.cpp    |   20 +-
 libnd4j/include/loops/cpu/indexreduce.hpp     |    2 +-
 libnd4j/include/loops/cpu/random.hpp          |   14 +-
 .../include/loops/cpu/reduce/reduce_bool.cpp  |    8 +-
 .../include/loops/cpu/reduce/reduce_float.hpp |    6 +-
 .../include/loops/cpu/reduce/reduce_long.cpp  |    6 +-
 .../include/loops/cpu/reduce/reduce_same.cpp  |    6 +-
 libnd4j/include/loops/cpu/reduce3.hpp         |    2 +-
 libnd4j/include/loops/cpu/scalar.hpp          |    4 +-
 libnd4j/include/loops/cpu/scalar_bool.cpp     |    4 +-
 libnd4j/include/loops/cpu/scalar_int.cpp      |    4 +-
 .../include/loops/cpu/summarystatsreduce.cpp  |   10 +-
 .../ops/declarable/generic/nn/batchnorm.cpp   |    8 +-
 .../declarable/helpers/cpu/BarnesHutTsne.cpp  |    6 +-
 .../declarable/helpers/cpu/activations.cpp    |   12 +-
 .../ops/declarable/helpers/cpu/addBias.cpp    |    4 +-
 .../ops/declarable/helpers/cpu/axis.cpp       |    4 +-
 .../ops/declarable/helpers/cpu/batchnorm.cpp  |    4 +-
 .../ops/declarable/helpers/cpu/col2im.cpp     |    6 +-
 .../ops/declarable/helpers/cpu/dilation2d.cpp |    8 +-
 .../ops/declarable/helpers/cpu/dropout.cpp    |    2 +-
 .../ops/declarable/helpers/cpu/dynamic.cpp    |   18 +-
 .../ops/declarable/helpers/cpu/flatten.cpp    |    2 +-
 .../ops/declarable/helpers/cpu/hashcode.cpp   |    9 +-
 .../ops/declarable/helpers/cpu/histogram.cpp  |    2 +-
 .../ops/declarable/helpers/cpu/im2col.cpp     |    8 +-
 .../declarable/helpers/cpu/image_resize.cpp   |   20 +-
 .../ops/declarable/helpers/cpu/lrn.cpp        |   20 +-
 .../ops/declarable/helpers/cpu/lstm.cpp       |    2 +-
 .../declarable/helpers/cpu/matrix_band.cpp    |    2 +-
 .../ops/declarable/helpers/cpu/prefix.cpp     |    4 +-
 .../include/ops/declarable/helpers/cpu/qr.cpp |    8 +-
 .../ops/declarable/helpers/cpu/random.cpp     |   12 +-
 .../declarable/helpers/cpu/random_crop.cpp    |    2 +-
 .../ops/declarable/helpers/cpu/roll.cpp       |    2 +-
 .../ops/declarable/helpers/cpu/s_t_b.cpp      |   16 +-
 .../ops/declarable/helpers/cpu/segment.cpp    |   58 +-
 .../ops/declarable/helpers/cpu/sg_cb.cpp      |    6 +-
 .../ops/declarable/helpers/cpu/solve.cpp      |    8 +-
 .../ops/declarable/helpers/cpu/split.cpp      |    6 +-
 .../ops/declarable/helpers/cpu/top_k.cpp      |   14 +-
 .../ops/declarable/helpers/cpu/transforms.cpp |   16 +-
 .../helpers/cpu/triangular_solve.cpp          |   20 +-
 .../ops/declarable/helpers/cpu/weights.cpp    |    2 +-
 48 files changed, 1084 insertions(+), 1084 deletions(-)

diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h
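The pattern applied throughout this patch: loop counters that were declared as 32-bit `uint` while their bounds are 64-bit `Nd4jLong` are retyped to `Nd4jLong`. Because unsigned overflow is well-defined, a 32-bit counter compared against a 64-bit bound forces the compiler to account for the counter wrapping before it reaches the bound, which commonly prevents auto-vectorization of an otherwise simple counted loop. Below is a minimal sketch of the before/after shape of the fix — it is not code from the patch; it assumes `Nd4jLong` is an alias for `int64_t`, and the function names are illustrative only:

#include <cstdint>

using Nd4jLong = int64_t;  // assumption: libnd4j's 64-bit index type

// Before: 32-bit counter, 64-bit trip count. `i` is promoted to 64 bits
// for the comparison, but `i++` wraps at 2^32, so the compiler cannot
// prove a finite trip count and typically refuses to vectorize.
void scaleBefore(const float* x, float* z, Nd4jLong len) {
    for (unsigned int i = 0; i < len; i++)
        z[i] = x[i] * 2.0f;
}

// After (the fix this PR applies across the loops): counter and bound
// share one 64-bit type, giving a provably finite counted loop that the
// auto-vectorizer handles cleanly.
void scaleAfter(const float* x, float* z, Nd4jLong len) {
    for (Nd4jLong i = 0; i < len; i++)
        z[i] = x[i] * 2.0f;
}

In the hunks below this shows up as `for (uint i0 = ...)` becoming `for (Nd4jLong i0 = ...)` in the RANK1 through RANK5 cases and the TAD reduction loops.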
index fb1582056..680a5f0aa 100644 --- a/libnd4j/include/helpers/Loops.h +++ b/libnd4j/include/helpers/Loops.h @@ -14,9 +14,9 @@ * SPDX-License-Identifier: Apache-2.0 ******************************************************************************/ -// -// @author Yurii Shyrma (iuriish@yahoo.com), created on 14.03.2019 -// + // + // @author Yurii Shyrma (iuriish@yahoo.com), created on 14.03.2019 + // #ifndef LIBND4J_LOOPS_H #define LIBND4J_LOOPS_H @@ -45,7 +45,7 @@ namespace nd4j { }; template - class ReductionFloatLoops : public ReductionLoops { + class ReductionFloatLoops : public ReductionLoops { public: static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, Z* extraParams, int64_t start, int64_t stop); @@ -54,7 +54,7 @@ namespace nd4j { }; template - class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops { + class ND4J_EXPORT ReductionBoolLoops : public ReductionLoops { public: static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); @@ -63,7 +63,7 @@ namespace nd4j { }; template - class ND4J_EXPORT ReductionLongLoops : public ReductionLoops { + class ND4J_EXPORT ReductionLongLoops : public ReductionLoops { public: static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, Z* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); @@ -72,7 +72,7 @@ namespace nd4j { }; template - class ND4J_EXPORT ReductionSameLoops : public ReductionLoops { + class ND4J_EXPORT ReductionSameLoops : public ReductionLoops { public: static void wrapper(const int opNum, X* x, Nd4jLong* xShapeInfo, X* z, Nd4jLong* zShapeInfo, Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, X* extraParams, int64_t start, int64_t stop); @@ -125,158 +125,158 @@ namespace nd4j { -/* -////////////////////////////////////////////////////////////////////////////// -template -void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, - const Y* y, const Nd4jLong* yShapeInfo, - Z* z, const Nd4jLong* zShapeInfo, - Z* extraParams, - std::function op) { + /* + ////////////////////////////////////////////////////////////////////////////// + template + void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, + const Y* y, const Nd4jLong* yShapeInfo, + Z* z, const Nd4jLong* zShapeInfo, + Z* extraParams, + std::function op) { - const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); + const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); - const Nd4jLong* xShape = shape::shapeOf(xShapeInfo); - const Nd4jLong* xStride = shape::stride(xShapeInfo); - const Nd4jLong* yStride = shape::stride(yShapeInfo); - const Nd4jLong* zStride = shape::stride(zShapeInfo); + const Nd4jLong* xShape = shape::shapeOf(xShapeInfo); + const Nd4jLong* xStride = shape::stride(xShapeInfo); + const Nd4jLong* yStride = shape::stride(yShapeInfo); + const Nd4jLong* zStride = shape::stride(zShapeInfo); - const Nd4jLong len = shape::length(xShapeInfo); + const Nd4jLong len = shape::length(xShapeInfo); - OmpLaunchHelper threadsInfo(len); + OmpLaunchHelper threadsInfo(len); - switch (kindOfLoop) { + switch (kindOfLoop) { - case LoopKind::EWS1: { - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = 
threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + case LoopKind::EWS1: { + PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) + { + const auto threadNum = omp_get_thread_num(); + const auto threadOffset = threadsInfo.getThreadOffset(threadNum); + const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - const auto xi = x + threadOffset; - const auto yi = y + threadOffset; - auto zi = z + threadOffset; + const auto xi = x + threadOffset; + const auto yi = y + threadOffset; + auto zi = z + threadOffset; - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i] = op(xi[i], yi[i], extraParams); + PRAGMA_OMP_SIMD + for (uint i = 0; i < lenPerThread; i++) + zi[i] = op(xi[i], yi[i], extraParams); + } } - } - break; + break; - case LoopKind::EWSNONZERO: { - const uint xEws = shape::elementWiseStride(xShapeInfo); - const uint yEws = shape::elementWiseStride(yShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); + case LoopKind::EWSNONZERO: { + const uint xEws = shape::elementWiseStride(xShapeInfo); + const uint yEws = shape::elementWiseStride(yShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - const auto threadNum = omp_get_thread_num(); - const auto threadOffset = threadsInfo.getThreadOffset(threadNum); - const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - const auto xi = x + threadOffset * xEws; - const auto yi = y + threadOffset * yEws; - auto zi = z + threadOffset * zEws; + PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) + { + const auto threadNum = omp_get_thread_num(); + const auto threadOffset = threadsInfo.getThreadOffset(threadNum); + const auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + const auto xi = x + threadOffset * xEws; + const auto yi = y + threadOffset * yEws; + auto zi = z + threadOffset * zEws; - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) - zi[i*zEws] = op(xi[i*xEws], yi[i*yEws], extraParams); + PRAGMA_OMP_SIMD + for (uint i = 0; i < lenPerThread; i++) + zi[i*zEws] = op(xi[i*xEws], yi[i*yEws], extraParams); + } } - } - break; + break; - case LoopKind::RANK1: { - PRAGMA_OMP_PARALLEL_FOR - for (uint i0 = 0; i0 < len; ++i0) - z[i0 * zStride[0]] = op(x[i0 * xStride[0]], y[i0 * yStride[0]], extraParams); - } - break; + case LoopKind::RANK1: { + PRAGMA_OMP_PARALLEL_FOR + for (uint i0 = 0; i0 < len; ++i0) + z[i0 * zStride[0]] = op(x[i0 * xStride[0]], y[i0 * yStride[0]], extraParams); + } + break; - case LoopKind::RANK2: { - PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i0 = 0; i0 < xShape[0]; ++i0) - for (uint i1 = 0; i1 < xShape[1]; ++i1) - z[i0 * zStride[0] + i1 * zStride[1]] = op(x[i0 * xStride[0] + i1 * xStride[1]], y[i0 * yStride[0] + i1 * yStride[1]], extraParams); - } - break; + case LoopKind::RANK2: { + PRAGMA_OMP_PARALLEL_FOR_SIMD + for (uint i0 = 0; i0 < xShape[0]; ++i0) + for (uint i1 = 0; i1 < xShape[1]; ++i1) + z[i0 * zStride[0] + i1 * zStride[1]] = op(x[i0 * xStride[0] + i1 * xStride[1]], y[i0 * yStride[0] + i1 * yStride[1]], extraParams); + } + break; - case LoopKind::RANK3: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) - for (uint i0 = 0; i0 < xShape[0]; ++i0) - for (uint i1 = 0; i1 < xShape[1]; ++i1) - for (uint i2 = 0; i2 < xShape[2]; ++i2) - z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]], 
extraParams); - } - break; + case LoopKind::RANK3: { + PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(2) + for (uint i0 = 0; i0 < xShape[0]; ++i0) + for (uint i1 = 0; i1 < xShape[1]; ++i1) + for (uint i2 = 0; i2 < xShape[2]; ++i2) + z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]], extraParams); + } + break; - case LoopKind::RANK4: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(3) - for (uint i0 = 0; i0 < xShape[0]; ++i0) - for (uint i1 = 0; i1 < xShape[1]; ++i1) - for (uint i2 = 0; i2 < xShape[2]; ++i2) - for (uint i3 = 0; i3 < xShape[3]; ++i3) - z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]], extraParams); - } - break; + case LoopKind::RANK4: { + PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(3) + for (uint i0 = 0; i0 < xShape[0]; ++i0) + for (uint i1 = 0; i1 < xShape[1]; ++i1) + for (uint i2 = 0; i2 < xShape[2]; ++i2) + for (uint i3 = 0; i3 < xShape[3]; ++i3) + z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]], extraParams); + } + break; - case LoopKind::RANK5: { - PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(4) - for (uint i0 = 0; i0 < xShape[0]; ++i0) - for (uint i1 = 0; i1 < xShape[1]; ++i1) - for (uint i2 = 0; i2 < xShape[2]; ++i2) - for (uint i3 = 0; i3 < xShape[3]; ++i3) - for (uint i4 = 0; i4 < xShape[4]; ++i4) - z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]+i4*zStride[4]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]+i4*xStride[4]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]+i4*yStride[4]], extraParams); - } - break; + case LoopKind::RANK5: { + PRAGMA_OMP_PARALLEL_FOR_SIMD_COLLAPSE(4) + for (uint i0 = 0; i0 < xShape[0]; ++i0) + for (uint i1 = 0; i1 < xShape[1]; ++i1) + for (uint i2 = 0; i2 < xShape[2]; ++i2) + for (uint i3 = 0; i3 < xShape[3]; ++i3) + for (uint i4 = 0; i4 < xShape[4]; ++i4) + z[i0*zStride[0]+i1*zStride[1]+i2*zStride[2]+i3*zStride[3]+i4*zStride[4]] = op(x[i0*xStride[0]+i1*xStride[1]+i2*xStride[2]+i3*xStride[3]+i4*xStride[4]], y[i0*yStride[0]+i1*yStride[1]+i2*yStride[2]+i3*yStride[3]+i4*yStride[4]], extraParams); + } + break; - default: { - uint xShapeInfoCast[MAX_RANK]; - uint yShapeInfoCast[MAX_RANK]; - uint zShapeInfoCast[MAX_RANK]; + default: { + uint xShapeInfoCast[MAX_RANK]; + uint yShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; - bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastY = DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); - bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); + bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastY = DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); + bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) - { - auto threadNum = omp_get_thread_num(); - auto threadOffset = threadsInfo.getThreadOffset(threadNum); - auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); - PRAGMA_OMP_SIMD - for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); - auto zOffset = 
shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = op(x[xOffset], y[yOffset], extraParams); + PRAGMA_OMP_PARALLEL_THREADS(threadsInfo._numThreads) + { + auto threadNum = omp_get_thread_num(); + auto threadOffset = threadsInfo.getThreadOffset(threadNum); + auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); + PRAGMA_OMP_SIMD + for (uint i = 0; i < lenPerThread; i++) { + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = op(x[xOffset], y[yOffset], extraParams); + } } } } } -} -*/ + */ -////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////// template template void nd4j::ReductionLoops::loopReduce(X* x, Nd4jLong* xShapeInfo, - Z* z, Nd4jLong* zShapeInfo, - Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, - E* extraParams, int64_t start, int64_t stop) { + Z* z, Nd4jLong* zShapeInfo, + Nd4jLong* tadShapeInfo, Nd4jLong* tadOffsets, + E* extraParams, int64_t start, int64_t stop) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopTadXZ(xShapeInfo, zShapeInfo, tadShapeInfo); - const Nd4jLong zLen = shape::length(zShapeInfo); + const Nd4jLong zLen = shape::length(zShapeInfo); const Nd4jLong tadLen = shape::length(tadShapeInfo); const uint tadEws = shape::elementWiseStride(tadShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); - const Nd4jLong* tadShape = shape::shapeOf(tadShapeInfo); + const Nd4jLong* tadShape = shape::shapeOf(tadShapeInfo); const Nd4jLong* tadStride = shape::stride(tadShapeInfo); int numThreads = OmpLaunchHelper::tadThreads(tadLen, zLen); @@ -298,192 +298,192 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, // printf("%u - %lld\n", i, zOffset); // } // } - case LoopKind::SMALLARR2DX: { - const auto uTadLen = static_cast(tadLen); - const auto uZLenMinusOne = static_cast(zLen - 1); - const auto xLen = static_cast(zLen * uTadLen); - const auto sv = static_cast(OpType::startingValue(x)); + case LoopKind::SMALLARR2DX: { + const auto uTadLen = static_cast(tadLen); + const auto uZLenMinusOne = static_cast(zLen - 1); + const auto xLen = static_cast(zLen * uTadLen); + const auto sv = static_cast(OpType::startingValue(x)); - for (uint i = 0; i <= uZLenMinusOne; i++) - z[i] = OpType::startingValue(x); + for (uint i = 0; i <= uZLenMinusOne; i++) + z[i] = OpType::startingValue(x); - uint zOffset = 0; - for (uint i = 0; i < xLen; ++i) { - z[zOffset] = OpType::update(z[zOffset], OpType::op(x[i], extraParams), extraParams); - zOffset = zOffset == uZLenMinusOne ? 0 : zOffset + 1; + uint zOffset = 0; + for (uint i = 0; i < xLen; ++i) { + z[zOffset] = OpType::update(z[zOffset], OpType::op(x[i], extraParams), extraParams); + zOffset = zOffset == uZLenMinusOne ? 
0 : zOffset + 1; + } + + for (uint i = 0; i <= uZLenMinusOne; i++) + z[i] = OpType::postProcess(z[i], tadLen, extraParams); + } + break; + + //*********************************************// + case LoopKind::EWS1: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong j = 0; j < tadLen; j++) + s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::EWSNONZERO: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong j = 0; j < tadLen; j++) + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK1: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK2: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1]], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK3: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK4: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams); + + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK5: { + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) + for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) + s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams); + + z[i] = 
OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::X_EWSNONZERO: { + uint castZShapeInfo[MAX_RANK]; + const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); + + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong j = 0; j < tadLen; j++) + s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); + + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::Z_EWSNONZERO: { + uint castTadShapeInfo[MAX_RANK]; + const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); + + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); + + for (Nd4jLong j = 0; j < tadLen; j++) { + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); + s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams); } - for (uint i = 0; i <= uZLenMinusOne; i++) - z[i] = OpType::postProcess(z[i], tadLen, extraParams); - } - break; + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::EWS1: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); + //*********************************************// + default: { + auto innertadOffsets = new Nd4jLong[tadLen]; + shape::calcOffsets(tadShapeInfo, innertadOffsets); - for (uint j = 0; j < tadLen; j++) - s = OpType::update(s, OpType::op(tad[j], extraParams), extraParams); + uint castZShapeInfo[MAX_RANK]; + const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + for (auto i = start; i < stop; i++) { + auto tad = x + tadOffsets[i]; + auto s = OpType::startingValue(tad); - //*********************************************// - case LoopKind::EWSNONZERO: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); + for (Nd4jLong j = 0; j < tadLen; j++) + s = OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - for (uint j = 0; j < tadLen; j++) - s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); + z[zOffset] = OpType::postProcess(s, tadLen, extraParams); + }; - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK1: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadLen; ++i0) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK2: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) - for (uint i1 = 0; i1 < tadShape[1]; ++i1) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + 
i1 * tadStride[1]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK3: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) - for (uint i1 = 0; i1 < tadShape[1]; ++i1) - for (uint i2 = 0; i2 < tadShape[2]; ++i2) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK4: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) - for (uint i1 = 0; i1 < tadShape[1]; ++i1) - for (uint i2 = 0; i2 < tadShape[2]; ++i2) - for (uint i3 = 0; i3 < tadShape[3]; ++i3) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::RANK5: { - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) - for (uint i1 = 0; i1 < tadShape[1]; ++i1) - for (uint i2 = 0; i2 < tadShape[2]; ++i2) - for (uint i3 = 0; i3 < tadShape[3]; ++i3) - for (uint i4 = 0; i4 < tadShape[4]; ++i4) - s = OpType::update(s, OpType::op(tad[i0 * tadStride[0] + i1 * tadStride[1] + i2 * tadStride[2] + i3 * tadStride[3] + i4 * tadStride[4]], extraParams), extraParams); - - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::X_EWSNONZERO: { - uint castZShapeInfo[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint j = 0; j < tadLen; j++) - s = OpType::update(s, OpType::op(tad[j * tadEws], extraParams), extraParams); - - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - case LoopKind::Z_EWSNONZERO: { - uint castTadShapeInfo[MAX_RANK]; - const bool canCastTad = nd4j::DataTypeUtils::castShapeInfo(tadShapeInfo, castTadShapeInfo); - - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); - s = OpType::update(s, OpType::op(tad[tadOffset], extraParams), extraParams); - } - - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; - - //*********************************************// - default: { - auto innertadOffsets = new Nd4jLong[tadLen]; - shape::calcOffsets(tadShapeInfo, innertadOffsets); - - uint castZShapeInfo[MAX_RANK]; - const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, castZShapeInfo); - - for (auto i = start; i < stop; i++) { - auto tad = x + tadOffsets[i]; - auto s = OpType::startingValue(tad); - - for (uint j = 0; j < tadLen; j++) - s = 
OpType::update(s, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); - z[zOffset] = OpType::postProcess(s, tadLen, extraParams); - }; - - delete[] innertadOffsets; - } + delete[] innertadOffsets; + } } } @@ -492,13 +492,13 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, ////////////////////////////////////////////////////////////////////////////// template template - void nd4j::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, - Z* z, Nd4jLong* zShapeInfo, - E* extraParams, uint64_t threadId, uint64_t numThreads) { + void nd4j::TransformLoops::loopTransform(X* x, Nd4jLong* xShapeInfo, + Z* z, Nd4jLong* zShapeInfo, + E* extraParams, uint64_t threadId, uint64_t numThreads) { const LoopKind::Kind kindOfLoop = LoopKind::deduceKindOfLoopXZ(xShapeInfo, zShapeInfo); - const Nd4jLong* xShape = shape::shapeOf(const_cast(xShapeInfo)); + const Nd4jLong* xShape = shape::shapeOf(const_cast(xShapeInfo)); const Nd4jLong* xStride = shape::stride(const_cast(xShapeInfo)); const Nd4jLong* zStride = shape::stride(const_cast(zShapeInfo)); @@ -510,182 +510,183 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, switch (kindOfLoop) { //*********************************************// - case LoopKind::EWS1: { - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - int64_t start = span.startX(), stop = span.stopX(); + case LoopKind::EWS1: { + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); - for (auto i = start; i < stop; i++) - z[i] = OpType::op(x[i], extraParams); + for (auto i = start; i < stop; i++) + z[i] = OpType::op(x[i], extraParams); + } + break; + + //*********************************************// + case LoopKind::EWSNONZERO: { + const uint xEws = shape::elementWiseStride(xShapeInfo); + const uint zEws = shape::elementWiseStride(zShapeInfo); + + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); + + for (auto i = start; i < stop; i++) + z[i * zEws] = OpType::op(x[i * xEws], extraParams); + } + break; + + //*********************************************// + case LoopKind::Z_EWSNONZERO: { + const uint zEws = shape::elementWiseStride(zShapeInfo); + uint castXShapeInfo[MAX_RANK]; + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); + + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + int64_t start = span.startX(), stop = span.stopX(); + + if (zEws > 1) { + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i * zEws] = OpType::op(x[xOffset], extraParams); } - break; - - //*********************************************// - case LoopKind::EWSNONZERO: { - const uint xEws = shape::elementWiseStride(xShapeInfo); - const uint zEws = shape::elementWiseStride(zShapeInfo); - - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - int64_t start = span.startX(), stop = span.stopX(); - - for (auto i = start; i < stop; i++) - z[i*zEws] = OpType::op(x[i*xEws], extraParams); + } + else { + for (auto i = start; i < stop; i++) { + const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); + z[i] = OpType::op(x[xOffset], extraParams); } - break; + } + } + break; - //*********************************************// - case LoopKind::Z_EWSNONZERO: { - const uint zEws = 
shape::elementWiseStride(zShapeInfo); - uint castXShapeInfo[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, castXShapeInfo); + //*********************************************// + case LoopKind::RANK1: { + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - int64_t start = span.startX(), stop = span.stopX(); + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); + } + break; + + //*********************************************// + case LoopKind::RANK2: { + auto uXShape0 = static_cast(xShape[0]); + auto uXShape1 = static_cast(xShape[1]); + + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) { + auto z0 = i0 * zStride[0]; + auto x0 = i0 * xStride[0]; + + for (auto i1 = span.startY(); i1 < span.stopY(); ++i1) + z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); + } + } + break; + + //*********************************************// + case LoopKind::RANK3: { + auto uXShape0 = xShape[0]; + auto uXShape1 = xShape[1]; + auto uXShape2 = xShape[2]; + + auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); + auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); + + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) { + auto z0 = i0 * zStride[0] + i1 * zStride[1]; + auto x0 = i0 * xStride[0] + i1 * xStride[1]; + + for (Nd4jLong i2 = 0; i2 < uXShape2; ++i2) + z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); + } + } + break; + + //*********************************************// + case LoopKind::RANK4: { + auto uXShape0 = xShape[0]; + auto uXShape1 = xShape[1]; + auto uXShape2 = xShape[2]; + auto uXShape3 = xShape[3]; + + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; + auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; + + for (Nd4jLong i3 = 0; i3 < uXShape3; ++i3) + z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); + } + } + break; + + //*********************************************// + case LoopKind::RANK5: { + auto uXShape0 = xShape[0]; + auto uXShape1 = xShape[1]; + auto uXShape2 = xShape[2]; + auto uXShape3 = xShape[3]; + auto uXShape4 = xShape[4]; + + auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); + auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); + + + for (auto i0 = span.startX(); i0 < span.stopX(); i0++) + for (auto i1 = span.startY(); i1 < span.stopY(); i1++) + for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { + auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; + auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; + + for (Nd4jLong i3 = 0; i3 < uXShape3; ++i3) { + + auto z1 = z0 
+ i3 * zStride[3]; + auto x1 = x0 + i3 * xStride[3]; + + for (Nd4jLong i4 = 0; i4 < uXShape4; ++i4) + z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); - if (zEws > 1) { - for (auto i = start; i < stop; i++) { - const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); - z[i * zEws] = OpType::op(x[xOffset], extraParams); - } - } else { - for (auto i = start; i < stop; i++) { - const auto xOffset = shape::indexOffset(i, xShapeInfo, castXShapeInfo, canCastX); - z[i] = OpType::op(x[xOffset], extraParams); } } - } - break; - //*********************************************// - case LoopKind::RANK1: { - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); + } + break; - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) - z[i0 * zStride[0]] = OpType::op(x[i0 * xStride[0]], extraParams); - } - break; + //*********************************************// + default: { + uint xShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; - //*********************************************// - case LoopKind::RANK2: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); + bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); - auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); + auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) { - auto z0 = i0 * zStride[0]; - auto x0 = i0 * xStride[0]; - - for (uint i1 = span.startY(); i1 < span.stopY(); ++i1) - z[z0 + i1 * zStride[1]] = OpType::op(x[x0 + i1 * xStride[1]], extraParams); - } - } - break; - - //*********************************************// - case LoopKind::RANK3: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - - auto loop = samediff::ThreadsHelper::pickLoop2d(numThreads, uXShape0, uXShape1); - auto span = samediff::Span2::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1); - - - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) - for (auto i1 = span.startY(); i1 < span.stopY(); i1++) { - auto z0 = i0 * zStride[0] + i1 * zStride[1]; - auto x0 = i0 * xStride[0] + i1 * xStride[1]; - - for (uint i2 = 0; i2 < uXShape2; ++i2) - z[z0 + i2 * zStride[2]] = OpType::op(x[x0 + i2 * xStride[2]], extraParams); - } - } - break; - - //*********************************************// - case LoopKind::RANK4: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); - - auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); - auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) - for (auto i1 = span.startY(); i1 < span.stopY(); i1++) - for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - - for (uint i3 = 0; i3 < uXShape3; ++i3) - z[z0 + i3 * zStride[3]] = OpType::op(x[x0 + i3 * xStride[3]], extraParams); - } - } - break; - - //*********************************************// - case 
LoopKind::RANK5: { - auto uXShape0 = static_cast(xShape[0]); - auto uXShape1 = static_cast(xShape[1]); - auto uXShape2 = static_cast(xShape[2]); - auto uXShape3 = static_cast(xShape[3]); - auto uXShape4 = static_cast(xShape[4]); - - auto loop = samediff::ThreadsHelper::pickLoop3d(numThreads, uXShape0, uXShape1, uXShape2); - auto span = samediff::Span3::build(loop, threadId, numThreads, 0, uXShape0, 1, 0, uXShape1, 1, 0, uXShape2, 1); - - - for (auto i0 = span.startX(); i0 < span.stopX(); i0++) - for (auto i1 = span.startY(); i1 < span.stopY(); i1++) - for (auto i2 = span.startZ(); i2 < span.stopZ(); i2++) { - auto z0 = i0 * zStride[0] + i1 * zStride[1] + i2 * zStride[2]; - auto x0 = i0 * xStride[0] + i1 * xStride[1] + i2 * xStride[2]; - - for (uint i3 = 0; i3 < uXShape3; ++i3) { - - auto z1 = z0 + i3 * zStride[3]; - auto x1 = x0 + i3 * xStride[3]; - - for (uint i4 = 0; i4 < uXShape4; ++i4) - z[z1 + i4 * zStride[4]] = OpType::op(x[x1 + i4 * xStride[4]], extraParams); - - } - } - - } - break; - - //*********************************************// - default: { - uint xShapeInfoCast[MAX_RANK]; - uint zShapeInfoCast[MAX_RANK]; - - bool canCastX = DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - bool canCastZ = DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); - - auto span = samediff::Span::build(threadId, numThreads, 0, len, 1); - - for (auto i = span.startX(); i < span.stopX(); i++) { - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); - auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); - z[zOffset] = OpType::op(x[xOffset], extraParams); - } - } + for (auto i = span.startX(); i < span.stopX(); i++) { + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); + z[zOffset] = OpType::op(x[xOffset], extraParams); + } + } } } -////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////// template template void nd4j::Reduction3Loops::loopReduce3(X* x, Nd4jLong* xShapeInfo, - X* y, Nd4jLong* yShapeInfo, - Z* z, Nd4jLong* zShapeInfo, - int* dims, int dimsLen, - Z* extraParameters, int64_t start, int64_t stop) { + X* y, Nd4jLong* yShapeInfo, + Z* z, Nd4jLong* zShapeInfo, + int* dims, int dimsLen, + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ @@ -694,29 +695,29 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const Nd4jLong xLen = shape::length(xShapeInfo); const Nd4jLong yLen = shape::length(yShapeInfo); - Nd4jLong *xTadShapeInfo = nullptr, *yTadShapeInfo = nullptr, *xTadOffsets = nullptr, *yTadOffsets = nullptr; + Nd4jLong* xTadShapeInfo = nullptr, * yTadShapeInfo = nullptr, * xTadOffsets = nullptr, * yTadOffsets = nullptr; TadPack tadPackX, tadPackY; std::vector zeroOffsets; - if(xLen == yLen) { - tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); - tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); + if (xLen == yLen) { + tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); + tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); xTadShapeInfo = tadPackX.primaryShapeInfo(); yTadShapeInfo = tadPackY.primaryShapeInfo(); - xTadOffsets = tadPackX.primaryOffsets(); - yTadOffsets = 
tadPackY.primaryOffsets(); + xTadOffsets = tadPackX.primaryOffsets(); + yTadOffsets = tadPackY.primaryOffsets(); } - else if(yLen > xLen) { - tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); + else if (yLen > xLen) { + tadPackY = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(yShapeInfo, dims, dimsLen); xTadShapeInfo = xShapeInfo; yTadShapeInfo = tadPackY.primaryShapeInfo(); - yTadOffsets = tadPackY.primaryOffsets(); + yTadOffsets = tadPackY.primaryOffsets(); } else { - tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); + tadPackX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeInfo, dims, dimsLen); yTadShapeInfo = yShapeInfo; xTadShapeInfo = tadPackX.primaryShapeInfo(); - xTadOffsets = tadPackX.primaryOffsets(); + xTadOffsets = tadPackX.primaryOffsets(); } @@ -724,162 +725,196 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const auto xTadEws = shape::elementWiseStride(xTadShapeInfo); const auto yTadEws = shape::elementWiseStride(yTadShapeInfo); - const auto zEws = shape::elementWiseStride(zShapeInfo); + const auto zEws = shape::elementWiseStride(zShapeInfo); - const auto zLen = shape::length(zShapeInfo); + const auto zLen = shape::length(zShapeInfo); const auto tadLen = shape::length(xTadShapeInfo); - const auto tadShape = shape::shapeOf(xTadShapeInfo); - const auto xTadStride = shape::stride(xTadShapeInfo); - const auto yTadStride = shape::stride(xTadShapeInfo); + const auto tadShape = shape::shapeOf(xTadShapeInfo); + const auto xTadStride = shape::stride(xTadShapeInfo); + const auto yTadStride = shape::stride(xTadShapeInfo); int numThreads = OmpLaunchHelper::tadThreads(tadLen, zLen); switch (kindOfLoop) { - //*********************************************// - case LoopKind::EWS1: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::EWS1: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint j = 0; j < tadLen; ++j) - s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + for (Nd4jLong j = 0; j < tadLen; ++j) + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[i] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + z[i] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::EWSNONZERO: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::EWSNONZERO: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? 
x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint j = 0; j < tadLen; ++j) - s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + for (Nd4jLong j = 0; j < tadLen; ++j) + s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::RANK1: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK1: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint i0 = 0; i0 < tadLen; ++i0) { - const auto xTadOffset = i0 * xTadStride[0]; - const auto yTadOffset = i0 * yTadStride[0]; + for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) { + const auto xTadOffset = i0 * xTadStride[0]; + const auto yTadOffset = i0 * yTadStride[0]; + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); + } + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; + + //*********************************************// + case LoopKind::RANK2: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; + + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); + + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } + } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + //*********************************************// + case LoopKind::RANK3: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - //*********************************************// - case LoopKind::RANK2: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); - - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::RANK3: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK4: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::RANK4: { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK5: { + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) { + for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - break; + } + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + break; - //*********************************************// - case LoopKind::RANK5: { + //*********************************************// + default: { + uint castXTadShapeInfo[MAX_RANK]; + const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); + + if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { Z extraParams[3]; for (auto i = start; i < stop; i++) { extraParams[0] = param0; @@ -890,83 +925,49 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; auto s = OpType::startingValue(xTad); - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - for (uint i4 = 0; i4 < tadShape[4]; ++i4) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); - } - } - } - } + for (Nd4jLong j = 0; j < tadLen; ++j) { + const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); + } + + z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); + }; + } + else { + uint castYTadShapeInfo[MAX_RANK]; + const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); + + Z extraParams[3]; + for (auto i = start; i < stop; i++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; + + const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; + const auto yTad = yTadOffsets ? 
y + yTadOffsets[i] : y; + auto s = OpType::startingValue(xTad); + + for (Nd4jLong j = 0; j < tadLen; ++j) { + const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); }; } - break; - - //*********************************************// - default: { - uint castXTadShapeInfo[MAX_RANK]; - const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); - - if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; - - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); - - for (uint j = 0; j < tadLen; ++j) { - const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); - } - - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - else { - uint castYTadShapeInfo[MAX_RANK]; - const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - - Z extraParams[3]; - for (auto i = start; i < stop; i++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; - - const auto xTad = xTadOffsets ? x + xTadOffsets[i] : x; - const auto yTad = yTadOffsets ? y + yTadOffsets[i] : y; - auto s = OpType::startingValue(xTad); - - for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); - } - z[i * zEws] = OpType::postProcess(s, tadLen, extraParams); - }; - } - } + } } } -////////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////////// template template void nd4j::Reduction3Loops::loopReduce3All(X* x, Nd4jLong* xShapeInfo, - X* y, Nd4jLong* yShapeInfo, - Z* z, Nd4jLong* zShapeInfo, - Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, - Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, - Z* extraParameters, int64_t start, int64_t stop) { + X* y, Nd4jLong* yShapeInfo, + Z* z, Nd4jLong* zShapeInfo, + Nd4jLong* xTadShapeInfo, Nd4jLong* xTadOffsets, + Nd4jLong* yTadShapeInfo, Nd4jLong* yTadOffsets, + Z* extraParameters, int64_t start, int64_t stop) { // both tads have same shape, however strides and ews may differ @@ -976,186 +977,223 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const auto xTadEws = shape::elementWiseStride(xTadShapeInfo); const auto yTadEws = shape::elementWiseStride(yTadShapeInfo); - const auto zEws = shape::elementWiseStride(zShapeInfo); + const auto zEws = shape::elementWiseStride(zShapeInfo); - const auto zLen = shape::length(zShapeInfo); + const auto zLen = shape::length(zShapeInfo); const auto tadLen = shape::length(xTadShapeInfo); const auto numXTads = shape::length(xShapeInfo) / tadLen; const auto numYTads = shape::length(yShapeInfo) / tadLen; - const auto 
tadShape = shape::shapeOf(xTadShapeInfo); - const auto xTadStride = shape::stride(xTadShapeInfo); - const auto yTadStride = shape::stride(yTadShapeInfo); + const auto tadShape = shape::shapeOf(xTadShapeInfo); + const auto xTadStride = shape::stride(xTadShapeInfo); + const auto yTadStride = shape::stride(yTadShapeInfo); const auto startVal = OpType::startingValue(x); - int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads*numYTads); + int numThreads = OmpLaunchHelper::tadThreads(tadLen, numXTads * numYTads); switch (kindOfLoop) { - //*********************************************// - case LoopKind::EWS1: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::EWS1: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint j = 0; j < tadLen; ++j) - s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); + for (Nd4jLong j = 0; j < tadLen; ++j) + s = OpType::update(s, OpType::op(xTad[j], yTad[j], extraParams), extraParams); - z[zInd] = OpType::postProcess(s, tadLen, extraParams); + z[zInd] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; + + //*********************************************// + case LoopKind::EWSNONZERO: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; + + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; + + for (Nd4jLong j = 0; j < tadLen; ++j) + s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); + + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; + + //*********************************************// + case LoopKind::RANK1: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; + + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; + + for (Nd4jLong i0 = 0; i0 < tadLen; ++i0) { + const auto xTadOffset = i0 * xTadStride[0]; + const auto yTadOffset = i0 * yTadStride[0]; + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - }; - } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::EWSNONZERO: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK2: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix 
< numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint j = 0; j < tadLen; ++j) - s = OpType::update(s, OpType::op(xTad[j * xTadEws], yTad[j * yTadEws], extraParams), extraParams); - - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); - } - }; - } - break; - - //*********************************************// - case LoopKind::RANK1: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; - - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; - - for (uint i0 = 0; i0 < tadLen; ++i0) { - const auto xTadOffset = i0 * xTadStride[0]; - const auto yTadOffset = i0 * yTadStride[0]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - }; - } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::RANK2: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK3: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - }; - } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::RANK3: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - 
extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK4: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - }; - } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::RANK4: { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + //*********************************************// + case LoopKind::RANK5: { + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3]; + for (Nd4jLong i0 = 0; i0 < tadShape[0]; ++i0) { + for (Nd4jLong i1 = 0; i1 < tadShape[1]; ++i1) { + for (Nd4jLong i2 = 0; i2 < tadShape[2]; ++i2) { + for (Nd4jLong i3 = 0; i3 < tadShape[3]; ++i3) { + for (Nd4jLong i4 = 0; i4 < tadShape[4]; ++i4) { + const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; + const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } } } } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } - }; 
- } - break; + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; + } + break; - //*********************************************// - case LoopKind::RANK5: { + //*********************************************// + default: { + uint castXTadShapeInfo[MAX_RANK]; + const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); + + if (shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { extraParams[0] = param0; extraParams[1] = param1; extraParams[2] = param2; @@ -1165,79 +1203,42 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, const auto zInd = ix * numYTads + iy; auto s = startVal; - for (uint i0 = 0; i0 < tadShape[0]; ++i0) { - for (uint i1 = 0; i1 < tadShape[1]; ++i1) { - for (uint i2 = 0; i2 < tadShape[2]; ++i2) { - for (uint i3 = 0; i3 < tadShape[3]; ++i3) { - for (uint i4 = 0; i4 < tadShape[4]; ++i4) { - const auto xTadOffset = i0 * xTadStride[0] + i1 * xTadStride[1] + i2 * xTadStride[2] + i3 * xTadStride[3] + i4 * xTadStride[4]; - const auto yTadOffset = i0 * yTadStride[0] + i1 * yTadStride[1] + i2 * yTadStride[2] + i3 * yTadStride[3] + i4 * yTadStride[4]; - s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); - } - } - } - } + for (Nd4jLong j = 0; j < tadLen; ++j) { + const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } - z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); } }; } - break; + else { + uint castYTadShapeInfo[MAX_RANK]; + const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - //*********************************************// - default: { - uint castXTadShapeInfo[MAX_RANK]; - const bool canCastXTad = nd4j::DataTypeUtils::castShapeInfo(xTadShapeInfo, castXTadShapeInfo); + Z extraParams[3]; + for (Nd4jLong ix = 0; ix < numXTads; ix++) { + for (Nd4jLong iy = 0; iy < numYTads; iy++) { + extraParams[0] = param0; + extraParams[1] = param1; + extraParams[2] = param2; - if(shape::haveSameShapeAndStrides(xTadShapeInfo, yTadShapeInfo)) { - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; + const auto xTad = x + xTadOffsets[ix]; + const auto yTad = y + yTadOffsets[iy]; + const auto zInd = ix * numYTads + iy; + auto s = startVal; - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; - - for (uint j = 0; j < tadLen; ++j) { - const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - s = OpType::update(s, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); - } - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + for (Nd4jLong j = 0; j < tadLen; ++j) { + const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); + s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), 
extraParams); } - }; - } - else { - uint castYTadShapeInfo[MAX_RANK]; - const bool canCastYTad = nd4j::DataTypeUtils::castShapeInfo(yTadShapeInfo, castYTadShapeInfo); - Z extraParams[3]; - for (auto ix = 0; ix < numXTads; ix++) { - for (auto iy = 0; iy < numYTads; iy++) { - extraParams[0] = param0; - extraParams[1] = param1; - extraParams[2] = param2; - - const auto xTad = x + xTadOffsets[ix]; - const auto yTad = y + yTadOffsets[iy]; - const auto zInd = ix * numYTads + iy; - auto s = startVal; - - for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); - const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); - s = OpType::update(s, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); - } - - z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); - } - }; - } + z[zInd * zEws] = OpType::postProcess(s, tadLen, extraParams); + } + }; } } + } } diff --git a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp index f047d1136..f18f0c788 100644 --- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp +++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp @@ -50,12 +50,12 @@ namespace nd4j { 1 == zArr.ews() && 'c' == zArr.ordering()); if (bSpecialCase && yArr.isColumnVector() && 1 == xArr.sizeAt(-1) ) { - auto yLen = (uint32_t)yArr.lengthOf(); + auto yLen = yArr.lengthOf(); auto func = PRAGMA_THREADS_FOR{ - for (uint32_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto rZ = z + (i * yLen); auto v = x[i]; - for (uint32_t j = 0; j < yLen; j++) { + for (Nd4jLong j = 0; j < yLen; j++) { rZ[j] = OpType::op(v, y[j]); } } @@ -74,13 +74,13 @@ namespace nd4j { if (bSpecialCase && bSpecialCase2) { - int zDim1 = zArr.sizeAt(-2); - int zDim2 = zArr.sizeAt(-1); + uint32_t zDim1 = zArr.sizeAt(-2); + uint32_t zDim2 = zArr.sizeAt(-1); - int nLen = zArr.lengthOf() / yArr.sizeAt(-1); + uint32_t nLen = zArr.lengthOf() / yArr.sizeAt(-1); auto func = PRAGMA_THREADS_FOR{ - for (uint32_t total = start; total < stop; total++) { + for (auto total = start; total < stop; total++) { uint32_t i = total / zDim1; uint32_t j = total % zDim1; @@ -127,7 +127,7 @@ namespace nd4j { yCoords[iy--] = 0; } } - } + } const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data()); const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data()); diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index 62058bd20..7226d00b3 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -184,7 +184,7 @@ namespace functions { const auto oX = x[i]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < loopLength; f++) + for (Nd4jLong f = 0; f < loopLength; f++) oZ[f] = OpType::op(oX, oY[f]); } } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){ @@ -198,7 +198,7 @@ namespace functions { const auto oY = y[i]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < loopLength; f++) + for (Nd4jLong f = 0; f < loopLength; f++) oZ[f] = OpType::op(oX[f], oY); } } @@ -213,14 +213,14 @@ namespace functions { Nd4jLong yStrides[3] = { 0,0,0 }; nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); - uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); - uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); + uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2); - for 
(uint32_t index0 = start; index0 < stop; index0++) { + for (auto index0 = start; index0 < stop; index0++) { PRAGMA_OMP_SIMD - for (uint32_t index1 = 0; index1 < nSize1; index1++) { - for (uint32_t index2 = 0; index2 < nSize2; index2++) { + for (uint64_t index1 = 0; index1 < nSize1; index1++) { + for (uint64_t index2 = 0; index2 < nSize2; index2++) { auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2); auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2); auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2); @@ -242,18 +242,18 @@ namespace functions { Nd4jLong yStrides[4] = { 0,0,0,0 }; nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides); - uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1); - uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2); - uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); + uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1); + uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2); + uint64_t nSize3 = shape::sizeAt(zShapeInfo, 3); - for (uint32_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { - uint32_t index0 = i / nSize1; - uint32_t index1 = i % nSize1; + uint64_t index0 = i / nSize1; + uint64_t index1 = i % nSize1; PRAGMA_OMP_SIMD - for (uint32_t index2 = 0; index2 < nSize2; index2++) { - for (uint32_t index3 = 0; index3 < nSize3; index3++) { + for (uint64_t index2 = 0; index2 < nSize2; index2++) { + for (uint64_t index3 = 0; index3 < nSize3; index3++) { auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3); auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3); auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3); @@ -279,7 +279,7 @@ namespace functions { uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3); uint32_t nSize4 = shape::sizeAt(zShapeInfo, 4); - for (uint32_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { uint32_t index0 = i / nSize1; uint32_t index1 = i % nSize1; @@ -326,7 +326,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); @@ -344,7 +344,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); @@ -362,7 +362,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); @@ -382,7 +382,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, 
xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); @@ -497,7 +497,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); @@ -515,7 +515,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); @@ -533,7 +533,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); @@ -553,7 +553,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.cpp b/libnd4j/include/loops/cpu/broadcasting_bool.cpp index 8d62b9506..faf6fdff6 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp @@ -183,7 +183,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset], extraParams); } @@ -200,7 +200,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset], extraParams); @@ -218,7 +218,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset], extraParams); @@ -237,7 +237,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); 
oZ[offset] = OpType::op(oX[xOffset], y[offset], extraParams); @@ -257,7 +257,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); @@ -357,7 +357,7 @@ namespace functions { auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset], extraParams); } @@ -375,7 +375,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset], extraParams); @@ -394,7 +394,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset], extraParams); @@ -413,7 +413,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset], extraParams); @@ -434,7 +434,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); diff --git a/libnd4j/include/loops/cpu/broadcasting_int.cpp b/libnd4j/include/loops/cpu/broadcasting_int.cpp index 9dcce7545..9737cb4bb 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp @@ -177,7 +177,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } @@ -194,7 +194,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); @@ -212,7 +212,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for 
(unsigned int f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); @@ -230,7 +230,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); @@ -250,7 +250,7 @@ namespace functions { auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (unsigned int f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); @@ -347,7 +347,7 @@ namespace functions { auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } @@ -364,7 +364,7 @@ namespace functions { auto oZ = z + zTadOffset[i]; auto oY = y + tadOffsets[i]; - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); @@ -382,7 +382,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); @@ -400,7 +400,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); @@ -420,7 +420,7 @@ namespace functions { auto oY = y + tadOffsets[i]; PRAGMA_OMP_SIMD - for (int f = 0; f < tadLength; f++) { + for (uint f = 0; f < tadLength; f++) { auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); diff --git a/libnd4j/include/loops/cpu/indexreduce.hpp b/libnd4j/include/loops/cpu/indexreduce.hpp index 8d3af7eb4..f875e170b 100644 --- a/libnd4j/include/loops/cpu/indexreduce.hpp +++ b/libnd4j/include/loops/cpu/indexreduce.hpp @@ -124,7 +124,7 @@ void IndexReduce::exec(void *vx, Nd4jLong *xShapeInfo, return; const auto indexValue = OpType::startingIndexValue(x); - for (uint i = 0; i < zLen; i++) + for (Nd4jLong i = 0; i < zLen; i++) z[i] = (Z) indexValue.index; return; diff --git a/libnd4j/include/loops/cpu/random.hpp b/libnd4j/include/loops/cpu/random.hpp index 
ab9793694..3b9b3c515 100644 --- a/libnd4j/include/loops/cpu/random.hpp +++ b/libnd4j/include/loops/cpu/random.hpp @@ -93,7 +93,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); @@ -111,7 +111,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); @@ -129,7 +129,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto offset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); @@ -149,7 +149,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); @@ -197,7 +197,7 @@ namespace functions { else{ auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } @@ -213,7 +213,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); @@ -255,7 +255,7 @@ namespace functions { auto func = PRAGMA_THREADS_FOR { PRAGMA_OMP_SIMD - for (uint64_t i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ); z[offset] = OpClass::op(i, length, rng, extraArguments); } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp index 1ee820853..79eb9b209 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp @@ -55,7 +55,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) z[i] = startingVal; return; } @@ -68,7 +68,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, 
xShapeInfoCast, canCastX)], extraParams), extraParams); z[0] = OpType::postProcess(startingValue, length, extraParams); @@ -94,7 +94,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); return OpType::postProcess(startingValue, length, extraParams); @@ -156,7 +156,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < resultLength; i++) + for (Nd4jLong i = 0; i < resultLength; i++) z[i] = startingVal; return; } } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_float.hpp b/libnd4j/include/loops/cpu/reduce/reduce_float.hpp index d0a80a3f5..4437f52c0 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_float.hpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_float.hpp @@ -59,7 +59,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) z[i] = startingVal; return; @@ -113,7 +113,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); return OpType::postProcess(startingValue, length, extraParams); @@ -184,7 +184,7 @@ namespace functions { return; const auto startingVal = std::is_same<OpType, simdOps::Mean<X,Z>>::value ? 
nd4j::DataTypeUtils::nanOrZero<Z>() : static_cast<Z>(OpType::startingValue(x)); - for (uint i = 0; i < resultLength; i++) + for (Nd4jLong i = 0; i < resultLength; i++) z[i] = startingVal; return; } } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp index e53c9ac8e..08664fcab 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp @@ -55,7 +55,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) z[i] = startingVal; return; } @@ -110,7 +110,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); return OpType::postProcess(startingValue, length, extraParams); @@ -173,7 +173,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < resultLength; i++) + for (Nd4jLong i = 0; i < resultLength; i++) z[i] = startingVal; return; } } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp index 929d9c4ff..e546f71ee 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp @@ -57,7 +57,7 @@ namespace functions { return; const auto startingVal = OpType::startingValue(x); - for (uint i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) z[i] = startingVal; return; } @@ -111,7 +111,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (auto i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, 
tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); }; } diff --git a/libnd4j/include/loops/cpu/scalar_bool.cpp b/libnd4j/include/loops/cpu/scalar_bool.cpp index d6dce445b..83e14ae66 100644 --- a/libnd4j/include/loops/cpu/scalar_bool.cpp +++ b/libnd4j/include/loops/cpu/scalar_bool.cpp @@ -74,7 +74,7 @@ namespace functions { auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) + for (int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); }; } @@ -84,7 +84,7 @@ namespace functions { auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) + for (int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); }; } diff --git a/libnd4j/include/loops/cpu/scalar_int.cpp b/libnd4j/include/loops/cpu/scalar_int.cpp index e9f96ff70..5fa51f765 100644 --- a/libnd4j/include/loops/cpu/scalar_int.cpp +++ b/libnd4j/include/loops/cpu/scalar_int.cpp @@ -74,7 +74,7 @@ namespace functions { auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) + for (int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); }; } @@ -84,7 +84,7 @@ namespace functions { auto oX = x + xTadOffsets[r]; PRAGMA_OMP_SIMD - for (unsigned int f = 0; f < tadLength; f++) + for (int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); }; } diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index 2e36b8085..ec3c847ec 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -91,7 +91,7 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; const bool canCast = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - for (uint64_t i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); SummaryStatsData curr; @@ -116,7 +116,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - int resultLength = shape::length(zShapeInfo); + auto resultLength = shape::length(zShapeInfo); if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) { if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY) @@ -124,7 +124,7 @@ namespace functions { SummaryStatsData comp; comp.initWithValue(x[0]); - for (uint i = 0; i < resultLength; i++) + for (Nd4jLong i = 0; i < resultLength; i++) z[i] = OpType::getValue(biasCorrected, comp); return; } @@ -166,14 +166,14 @@ namespace functions { comp.initWithValue(tx[0]); if (tadEWS == 1 && tadOrder == 'c') { - for (int i = 1; i < tadLength; i++) { + for (Nd4jLong i = 1; i < tadLength; i++) { SummaryStatsData indexVal2; indexVal2.initWithValue(tx[i]); comp = update(comp, OpType::op(indexVal2, extraParams), extraParams); } } else { - for (int i = 1; i < tadLength; i++) { + for (Nd4jLong i = 1; i < tadLength; i++) { auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); SummaryStatsData indexVal2; diff --git a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp index 3cf088ae9..ca9622af9 100644 --- a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp @@ 
-61,7 +61,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { else axes.push_back(inRank-1); // default dimension to reduce along is last dimension - const int numOfAxes = axes.size(); + const uint numOfAxes = axes.size(); REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank); // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes @@ -83,7 +83,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) { REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str()); // types of all input arrays should be the same - for(int i = 1; i < block.width(); ++i) + for(unsigned long i = 1; i < block.width(); ++i) REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM op: types of all input arrays should be the same !"); nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0); @@ -167,7 +167,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { else axes.push_back(inRank-1); // default dimension to reduce along is last dimension - const int numOfAxes = axes.size(); + const uint numOfAxes = axes.size(); REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM_BP op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank); // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes @@ -191,7 +191,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) { REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str()); // types of all input arrays should be the same (except dLdO) - for(int i = 1; i < block.width() - 2; ++i) + for(unsigned long i = 1; i < block.width() - 2; ++i) REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !"); // ***** calculations ***** // diff --git a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp index baf19de10..ee45d46a7 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp @@ -30,7 +30,7 @@ namespace helpers { int* pRowCounts = reinterpret_cast(rowCounts.buffer()); int const* pRows = reinterpret_cast(rowP->getBuffer()); int const* pCols = reinterpret_cast(colP->getBuffer()); - for (int n = 0; n < N; n++) { + for (Nd4jLong n = 0; n < N; n++) { int begin = pRows[n];//->e(n); int end = pRows[n + 1];//rowP->e(n + 1); for (int i = begin; i < end; i++) { @@ -72,7 +72,7 @@ namespace helpers { int const* pRows = reinterpret_cast(rowP->getBuffer()); int* symRowP = reinterpret_cast(outputRows->buffer()); symRowP[0] = 0; - for (int n = 0; n < N; n++) + for (Nd4jLong n = 0; n < N; n++) symRowP[n + 1] = symRowP[n] + rowCounts->e(n); // outputRows->printBuffer("output rows"); @@ -86,7 +86,7 @@ namespace helpers { std::vector offset(N);// = NDArrayFactory::create('c', {N}); //PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(guided) 
shared(offset)) - for (int n = 0; n < N; n++) { + for (Nd4jLong n = 0; n < N; n++) { int begin = pRows[n]; int bound = pRows[n + 1]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index 2e63c9d5e..738da9bc5 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -146,17 +146,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr auto length = shape::length(inShapeInfo); if (inEWS == 1) { - for (int i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i]); PRAGMA_OMP_SIMD_SUM(sum) - for (int i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { outBuff[i] = nd4j::math::nd4j_exp(inBuff[i] - max); sum += outBuff[i]; } PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { outBuff[i] /= sum; outBuff[i] = nd4j::math::nd4j_log(outBuff[i]); } @@ -164,17 +164,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr else if (inEWS > 1) { PRAGMA_OMP_SIMD_MAX(max) - for (int i = 0; i < length; i++) + for (Nd4jLong i = 0; i < length; i++) max = nd4j::math::nd4j_max(max, inBuff[i * inEWS]); PRAGMA_OMP_SIMD_SUM(sum) - for (int i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { outBuff[i * inEWS] = nd4j::math::nd4j_exp(inBuff[i * inEWS] - max); sum += outBuff[i * inEWS]; } PRAGMA_OMP_SIMD - for (int i = 0; i < length; i++) { + for (Nd4jLong i = 0; i < length; i++) { outBuff[i * inEWS] /= sum; outBuff[i * inEWS] = nd4j::math::nd4j_log(outBuff[i * inEWS]); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index 39e51f6d7..bfa1d5a32 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -443,7 +443,7 @@ namespace nd4j { const X* bias_new; X* bias_extra = nullptr; size_t total_num = 1; - for (size_t i = 0; i < rank; i++) { + for (Nd4jLong i = 0; i < rank; i++) { total_num *= bases[i]; } Nd4jLong inc; @@ -574,7 +574,7 @@ namespace nd4j { for (size_t i = 0; i < 2; i++) { numNC *= bases[i]; } - for (size_t i = 2; i < rank; i++) { + for (Nd4jLong i = 2; i < rank; i++) { numHW *= bases[i]; } Nd4jLong total_num = numNC * numHW; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp b/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp index eb56acb9c..f082cd248 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp @@ -27,7 +27,7 @@ namespace helpers { void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector& output) { output.resize(axisVector->lengthOf()); - for (int e = 0; e < axisVector->lengthOf(); e++) { + for (Nd4jLong e = 0; e < axisVector->lengthOf(); e++) { auto ca = axisVector->e(e); if (ca < 0) ca += rank; @@ -37,7 +37,7 @@ namespace helpers { } void adjustAxis(Nd4jLong rank, std::vector &axisVector) { - for (int e = 0; e < axisVector.size(); e++) { + for (size_t e = 0; e < axisVector.size(); e++) { auto a = axisVector[e]; if (a < 0) axisVector[e] = a + rank; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index aa9624600..32824684f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -66,7 +66,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps]; Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()]; - for (int j = 0; j < lenSmall; ++j) { + for (Nd4jLong j = 0; j < lenSmall; ++j) { const bool isOwner = (j < info._numThreads) ? thread_id == j : thread_id == (j % info._numThreads); @@ -96,7 +96,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* shape::outerArrayOffsets(zOffsets, j, output->getShapeInfo(), mean->getShapeInfo(), auxBuff, dimsToExclude.data()); PRAGMA_OMP_SIMD - for (uint i = 0; i < steps; ++i) + for (Nd4jLong i = 0; i < steps; ++i) z[zOffsets[i]] = (x[xOffsets[i]] - meanVal) * sigmaInvGam + betaVal; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp index 26f82bdd9..5573bb8f6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp @@ -65,8 +65,8 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp T *col, *im; int imRow, imCol; - for (uint b = start_x; b < stop_x; b += inc_x) { - for (uint c = start_y; c < stop_y; c += inc_y) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto c = start_y; c < stop_y; c += inc_y) { for (int kRow = 0; kRow < kH; ++kRow) { for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { @@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp auto func = PRAGMA_THREADS_FOR { T *col, *im; - for (uint b = start; b < stop; b++) { + for (auto b = start; b < stop; b++) { T *im0 = imBuff + b * imStride0; T *col4 = colBuff + b * colStride0; for (int colH = 0; colH < oH; ++colH, col4 += colStride4) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp index 6e801b1fa..b17d33db3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp @@ -55,8 +55,8 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const auto func = PRAGMA_THREADS_FOR_2D { - for (uint b = start_x; b < stop_x; b += inc_x) { - for (uint oh = start_y; oh < stop_y; oh += inc_y) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto oh = start_y; oh < stop_y; oh += inc_y) { for (uint ow = 0; ow < oW; ++ow) { for (uint c = 0; c < iC; ++c) { @@ -70,7 +70,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const const int iw = ow * sW - pW + kw * dW; if (iw < 0 || iw >= iW) continue; - uint xCoords[4] = {b, (uint)ih, (uint)iw, c}; + uint xCoords[4] = { static_cast<uint>(b), static_cast<uint>(ih), static_cast<uint>(iw), c}; uint yCoords[3] = {kh, kw, c}; const X val = x[shape::getOffset(xShapeInfo, xCoords)] + y[shape::getOffset(yShapeInfo, yCoords)]; @@ -79,7 +79,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const } } - uint zCoords[4] = {b, oh, ow, c}; + uint zCoords[4] = { static_cast<uint>(b), static_cast<uint>(oh), ow, c}; z[shape::getOffset(zShapeInfo, zCoords)] = static_cast<Z>(max); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp index a470f140a..e529ab84f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp @@ -63,7 +63,7 @@ namespace helpers { std::vector dims(reduceShape->lengthOf()); bool fit = true; - for( int i = 0; i < dims.size(); i++ ) { + for(auto i = 0; i < dims.size(); i++ ) { if (fit) { dims[i] = reduceShape->e(i); for (int e = 0; e < input->rankOf(); ++e) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp index 0673a6f2b..3030b1255 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp @@ -53,7 +53,7 @@ namespace nd4j { outputs[i].second = 0; //PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->elementwiseThreshold()) - for (int e = 0; e < indices->lengthOf(); ++e) + for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) if ((*indices).e(e) == i) listOutForCurrent.at(outputs[i].second++)->assign(listOfTensors.at(e)); } @@ -65,7 +65,7 @@ namespace nd4j { for (auto i = start; i < stop; i++) { outputs[i].first = outputList[i]; outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) + for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) if (indices->e(e) == i) outputs[i].first->p(outputs[i].second++, input->e(e)); } @@ -83,7 +83,7 @@ namespace nd4j { for (int e = 0; e < numOfData; e++) { auto data = inputs[e]; auto index = indices[e]; - for (int i = 0; i < index->lengthOf(); i++) { + for (Nd4jLong i = 0; i < index->lengthOf(); i++) { Nd4jLong pos = index->e(i); if (pos < 0) { nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos); @@ -100,7 +100,7 @@ namespace nd4j { } else { std::vector restDims(output->rankOf() - 1); - for (int i = restDims.size(); i > 0; i--) + for (auto i = restDims.size(); i > 0; i--) restDims[restDims.size() - i] = output->rankOf() - i; ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); @@ -109,12 +109,12 @@ namespace nd4j { auto data = inputs[e]; auto index = indices[e]; std::vector sourceDims(data->rankOf() - index->rankOf()); - for (int i = sourceDims.size(); i > 0; i--) + for (auto i = sourceDims.size(); i > 0; i--) sourceDims[sourceDims.size() - i] = data->rankOf() - i; ResultSet listOfTensors = data->allTensorsAlongDimension(sourceDims) ; - for (int i = 0; i < index->lengthOf(); i++) { + for (Nd4jLong i = 0; i < index->lengthOf(); i++) { auto pos = index->e(i); if (pos < 0) { nd4j_printf("dynamic_stitch: Index value should be non-negative. 
But %i was given", pos); @@ -146,7 +146,7 @@ namespace nd4j { ResultSet listOfTensors = outputList[0]->allTensorsAlongDimension(sourceDims); - for (unsigned int i = 0; i < inputGradientList.size(); i++) { + for (auto i = 0; i < inputGradientList.size(); i++) { outputs[i].first = inputGradientList[i]; if (outputs[i].first->rankOf() < 1) continue; // skip empty gradient outs std::vector outDims(outputs[i].first->rankOf() - 1); @@ -158,7 +158,7 @@ namespace nd4j { outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) + for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) if (indices->e(e) == i) listOfTensors.at(e)->assign(listOutForCurrent.at(outputs[i].second++)); } @@ -171,7 +171,7 @@ namespace nd4j { for (auto i = start; i < stop; i++) { outputs[i].first = inputGradientList[i]; outputs[i].second = 0; - for (int e = 0; e < indices->lengthOf(); ++e) + for (Nd4jLong e = 0; e < indices->lengthOf(); ++e) if (indices->e(e) == i) output->p(e, outputs[i].first->e(outputs[i].second++)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp index 30d4d3f7a..d43cd716f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp @@ -45,7 +45,7 @@ namespace nd4j { auto xShapeInfo = inputs[e]->shapeInfo(); auto xLength = inputs[e]->lengthOf(); - for (uint i = 0; i < xLength; i++) + for (Nd4jLong i = 0; i < xLength; i++) z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp index beb48e382..6ece88ae6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp @@ -26,7 +26,7 @@ namespace nd4j { namespace helpers { template static void hashCode_(LaunchContext *context, NDArray &array, NDArray &result) { - auto blockSize = 32; + Nd4jLong blockSize = 32; auto length = array.lengthOf(); int numBlocks = length / blockSize + ((length % blockSize == 0) ? 
0 : 1); auto tempA = NDArrayFactory::create('c', {numBlocks}, context); @@ -42,11 +42,11 @@ namespace nd4j { // we divide array into 32 element chunks, and store intermediate results once auto func = PRAGMA_THREADS_FOR { - for (auto b = 0; b < stop; b++) { + for (auto b = start; b < stop; b++) { auto blockBuffer = buffer + b * numBlocks; Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { + for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < length; e++) { auto v = longBytes(blockBuffer[e]); r = 31 * r + v; } @@ -68,7 +68,7 @@ namespace nd4j { auto blockBuffer = tempBuffer + b * numBlocks; Nd4jLong r = 1; - for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { + for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) { auto v = longBytes(blockBuffer[e]); r = 31 * r + v; } @@ -103,4 +103,3 @@ namespace nd4j { } } } - diff --git a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp index 911230367..8720b53d9 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp @@ -49,7 +49,7 @@ namespace nd4j { } PRAGMA_OMP_SIMD - for (int x = 0; x < numBins; x++) { + for (Nd4jLong x = 0; x < numBins; x++) { result[x] += bins[x]; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp index 7be34e6ca..43fa52d34 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp @@ -64,8 +64,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { auto func = PRAGMA_THREADS_FOR_2D { - for (int b = start_x; b < stop_x; b++) { - for (int c = start_y; c < stop_y; c++) { + for (auto b = start_x; b < stop_x; b++) { + for (auto c = start_y; c < stop_y; c++) { for (int kRow = 0; kRow < kH; ++kRow) { for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { @@ -98,8 +98,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input, NDArra T *col, *im; int imRow, imCol; - for (int b = start_x; b < stop_x; b += inc_x) { - for (int colH = start_y; colH < stop_y; colH += inc_y) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto colH = start_y; colH < stop_y; colH += inc_y) { for (int colW = 0; colW < oW; ++colW) { for (int c = 0; c < iC; ++c) { for (int kRow = 0; kRow < kH; ++kRow) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp index 23acab375..18b52925a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp @@ -219,16 +219,16 @@ namespace helpers { auto func = PRAGMA_THREADS_FOR { for (auto batch = start; batch < stop; ++batch) { auto pInput = pInputBuf + batch * inBatchNumValues; - for (auto y = 0; y < outHeight; ++y) { + for (Nd4jLong y = 0; y < outHeight; ++y) { auto pOutput = pOutputBuf + (batch * outHeight + y) * outRowSize; const T* ysInputLowerPtr = pInput + ys[y]._bottomIndex * inRowSize; const T* ysInputUpperPtr = pInput + ys[y]._topIndex * inRowSize; double yVal = ys[y]._interpolarValue; - for (auto x = 0; x < outWidth; ++x) { 
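// The change at this point -- `for (auto x = 0; ...)` becoming `for (Nd4jLong x = 0; ...)` --
// is the core pattern of this whole patch: the loop counter is widened so it has the same
// 64-bit type as its bound. A 32-bit (and especially unsigned) counter compared against a
// 64-bit bound forces the compiler to honour possible wrap-around, so it often cannot prove
// the trip count and declines to auto-vectorize. A minimal sketch of both variants, with
// hypothetical buffers (Nd4jLong is libnd4j's 64-bit signed index type):
//
//     #include <cstdint>
//     using Nd4jLong = int64_t;
//
//     void scaleSlow(float* z, const float* x, Nd4jLong n) {
//         for (uint32_t i = 0; i < n; ++i)   // may wrap before reaching n > 2^32:
//             z[i] = x[i] * 0.5f;            // trip count unprovable, rarely vectorized
//     }
//
//     void scaleFast(float* z, const float* x, Nd4jLong n) {
//         for (Nd4jLong i = 0; i < n; ++i)   // counter and bound share one type:
//             z[i] = x[i] * 0.5f;            // trip count is exactly n, vectorizable
//     }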
+ for (Nd4jLong x = 0; x < outWidth; ++x) { auto xsBottom = xsPtr[x]._bottomIndex; auto xsTop = xsPtr[x]._topIndex; auto xVal = xsPtr[x]._interpolarValue; - for (auto c = 0; c < channels; ++c) { + for (Nd4jLong c = 0; c < channels; ++c) { double topLeft(ysInputLowerPtr[xsBottom + c]); double topRight(ysInputLowerPtr[xsTop + c]); double bottomLeft(ysInputUpperPtr[xsBottom + c]); @@ -310,14 +310,14 @@ namespace helpers { if (halfPixelCenter) { inY = nd4j::math::nd4j_max(0LL, inY); } - for (auto x = 0; x < outWidth; ++x) { + for (Nd4jLong x = 0; x < outWidth; ++x) { auto posX = alignCorners ? static_cast<Nd4jLong>(nd4j::math::p_round(scaler(x, st.widthScale))) : static_cast<Nd4jLong>(nd4j::math::p_floor(scaler(x, st.widthScale))); Nd4jLong inX = nd4j::math::nd4j_min(posX,inWidth - 1); if (halfPixelCenter) { inX = nd4j::math::nd4j_max(0LL, inX); } // copy pixel over all channels - for (auto e = 0; e < channels; e++) + for (Nd4jLong e = 0; e < channels; e++) output->t(b, y, x, e) = images->t(b, inY, inX, e); } } @@ -613,7 +613,7 @@ namespace helpers { for (auto b = start; b < stop; ++b) { auto pInput = inputPtr + b * inBatchWidth; - for (auto y = 0; y < outHeight; ++y) { + for (Nd4jLong y = 0; y < outHeight; ++y) { auto pOutput = &pOutputY[(b * outHeight + y) * outWidth * numChannels]; WeightsAndIndices yWai; @@ -635,7 +635,7 @@ namespace helpers { F cached_value_0[4] = {0}; F cached_value_1[4] = {0}; F cached_value_2[4] = {0}; - for (auto x = 0; x < resizerState.outWidth; ++x) { + for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) { const WeightsAndIndices &xWai = xWais[x]; // Shift values in cached_value_* to fill first '_advance' values. switch (xWai._advance) { @@ -712,7 +712,7 @@ namespace helpers { xWai._weight2, xWai._weight3); } } else { - for (auto x = 0; x < resizerState.outWidth; ++x) { + for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) { const WeightsAndIndices &xWai = xWais[x]; // Shift values in cachedValue to fill first '_advance' values.
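// Loops that run between the start/stop arguments of a PRAGMA_THREADS_FOR lambda are
// instead rewritten with `auto`, so the counter deduces the 64-bit type of those
// parameters and cannot narrow it. A minimal sketch of why that deduction is safe
// (parallel_for below is a simplified stand-in for the library's threading helper,
// not its real API):
//
//     #include <cstdint>
//
//     template <typename F>
//     void parallel_for(int64_t start, int64_t stop, F&& body) {
//         body(start, stop);                      // single-threaded stand-in
//     }
//
//     void fill(float* z, int64_t n) {
//         parallel_for(0, n, [=](int64_t start, int64_t stop) {
//             for (auto i = start; i < stop; ++i) // auto deduces int64_t here,
//                 z[i] = 1.0f;                    // so no uint truncation is possible
//         });
//     }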
switch (xWai._advance) { @@ -828,7 +828,7 @@ namespace helpers { float sum_0 = 0; float sum_1 = 0; float sum_2 = 0; - for (int i = 0; i < yPtrs.size(); ++i) { + for (size_t i = 0; i < yPtrs.size(); ++i) { const T* ptr = yPtrs[i].yPtr; float scaleX = xCache.startScale; Nd4jLong offset = 3 * boundIfNeeded(xCache.start, st.inWidth); @@ -879,7 +879,7 @@ namespace helpers { const auto numChannels = st.channels; for (Nd4jLong c = 0; c < numChannels; ++c) { float sum = 0; - for (int i = 0; i < yPtrs.size(); ++i) { + for (size_t i = 0; i < yPtrs.size(); ++i) { T const* ptr = yPtrs[i].yPtr; float scaleX = xCache.startScale; float sumY = static_cast<float>(ptr[numChannels * boundIfNeeded(xCache.start, st.inWidth) + c]) * scaleX; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp index aeb9e38b0..226e3ceed 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp @@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out if(inTadEws == 1 && outTadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { const T *x = inBuff + inTadOffsets[i]; T *y = outBuff + outTadOffsets[i]; @@ -70,7 +70,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -100,7 +100,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out } else { auto func = PRAGMA_THREADS_FOR { - for (uint i = 0; i < numOfTads; ++i) { + for (Nd4jLong i = 0; i < numOfTads; ++i) { const T *x = inBuff + inTadOffsets[i]; T *y = outBuff + outTadOffsets[i]; @@ -108,7 +108,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -179,13 +179,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c if(inTadEws == 1 && gradITadEws == 1) { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -208,7 +208,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c Y prev = 0; // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j <
tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -247,13 +247,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c else { auto func = PRAGMA_THREADS_FOR { - for (uint i = start; i < stop; i++) { + for (auto i = start; i < stop; i++) { const X *x = inBuff + inTadOffsets[i]; Y *y = gradIBuff + gradITadOffsets[i]; // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1] // we store each squared sum in corresponding element of y array - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); @@ -280,7 +280,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c Y prev = 0; // second loop calculates derivatives using information gained in first loop above - for (uint j = 0; j < tadLen; ++j) { + for (Nd4jLong j = 0; j < tadLen; ++j) { const uint begin = nd4j::math::nd4j_max(0, j - depth); const uint last = depth + j + 1; const uint end = nd4j::math::nd4j_min(last, tadLen); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp index 634d875d2..47c5c2a22 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp @@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast, auto h_ = h->bufferAsT(); auto func = PRAGMA_THREADS_FOR { - for (uint e = start; e < stop; e++) { + for (auto e = start; e < stop; e++) { c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]); h_[e] = nd4j::math::nd4j_tanh(c_[e]); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp index fbab49e80..53531dd17 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp @@ -32,7 +32,7 @@ namespace helpers { Nd4jLong preLastDim = input->rankOf() - 2; ResultSet listOut = output->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}); ResultSet listDiag = input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim}); - for (Nd4jLong e = 0; e < listOut.size(); ++e) { + for (Nd4jLong e = 0; e < static_cast<Nd4jLong>(listOut.size()); ++e) { NDArray* inputMatrix = listDiag.at(e); NDArray* outputMatrix = listOut.at(e); if (outputMatrix != inputMatrix) // if not inplace diff --git a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp index 43c65f14b..233d7d972 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp @@ -68,7 +68,7 @@ namespace nd4j { if (shape::elementWiseStride(xShapeInfo) == 1 && shape::elementWiseStride(zShapeInfo) == 1 && shape::order(xShapeInfo) == 'c' && shape::order(zShapeInfo) == 'c') { - for (int e = 0; e < length; e++) { + for (Nd4jLong e = 0; e < length; e++) { sum = op == scalar::Add ?
simdOps::Add::op(sum, x[e]) : simdOps::Multiply::op(sum, x[e]); if (!exclusive) @@ -81,7 +81,7 @@ } else { - for (int e = 0; e < length; e++) { + for (Nd4jLong e = 0; e < length; e++) { auto xOffset = shape::getIndexOffset(e, xShapeInfo); auto zOffset = shape::getIndexOffset(e, zShapeInfo); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp index 9e1980e54..8daccaeac 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp @@ -43,8 +43,8 @@ namespace helpers { T const* vBuf = v.getDataBuffer()->primaryAsT(); T* resBuf = res.dataBuffer()->primaryAsT(); auto interloop = PRAGMA_THREADS_FOR_2D { - for (int i = start_x; i < n; i += inc_x) - for (int j = start_y; j < n; j += inc_y) + for (auto i = start_x; i < n; i += inc_x) + for (auto j = start_y; j < n; j += inc_y) resBuf[i * n + j] = -2 * vBuf[i] * vBuf[j] + (i == j ? T(1) : T(0)); }; @@ -63,7 +63,7 @@ namespace helpers { NDArray z = *matrix; NDArray e('c', {M}, DataTypeUtils::fromT()); // two internal buffers and scalar for squared norm - for (auto k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further than row number + for (Nd4jLong k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further than row number e.nullify(); z = matrixMinor(z, k); // minor computing for current column with given matrix z (initially is an input matrix) // z.printIndexedBuffer("Minor!!!"); @@ -87,7 +87,7 @@ namespace helpers { } resQ.assign(q[0]); // // MmulHelper::matmul(&q[0], matrix, &resR, false, false); - for (int i = 1; i < N && i < M - 1; i++) { + for (Nd4jLong i = 1; i < N && i < M - 1; i++) { auto tempResQ = resQ; MmulHelper::matmul(&q[i], &resQ, &tempResQ, false, false); // use mmulMxM? resQ = std::move(tempResQ); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/random.cpp b/libnd4j/include/ops/declarable/helpers/cpu/random.cpp index ad04db307..e8f37f31c 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/random.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/random.cpp @@ -57,10 +57,10 @@ namespace helpers { T* outputBuf = output->dataBuffer()->primaryAsT(); PRAGMA_OMP_PARALLEL_FOR - for (auto k = 0; k < shift; k++) { + for (Nd4jLong k = 0; k < shift; k++) { auto pos = k * step; auto u = rng.relativeT(k, 0., 1.); - for (auto e = 0; e < step; e++) + for (Nd4jLong e = 0; e < step; e++) if (directOutput) { outputBuf[pos + e] = math::nd4j_igamma(copyAlpha->t(e), beta != nullptr ?
copyBeta->t(e) * u : u); @@ -104,10 +104,10 @@ namespace helpers { bool directLa = lambda->ews() == 1 && lambda->ordering() == 'c'; bool directOut = output->ews() == 1 && output->ordering() == 'c'; PRAGMA_OMP_PARALLEL_FOR - for (auto k = 0; k < shift; k++) { + for (Nd4jLong k = 0; k < shift; k++) { auto pos = k * step; auto u = rng.relativeT(k, 0., 1.); - for (auto e = 0; e < step; e++) { + for (Nd4jLong e = 0; e < step; e++) { auto p = math::nd4j_exp(-lambda->t(e)); auto s = p; auto x = T(0.f); @@ -143,7 +143,7 @@ namespace helpers { RandomLauncher::fillUniform(context, rng, output, minVal, maxVal); else { PRAGMA_OMP_PARALLEL_FOR - for (auto i = 0; i < output->lengthOf(); i++) { + for (Nd4jLong i = 0; i < output->lengthOf(); i++) { output->t(i) = rng.relativeT(i, minVal, maxVal); } } @@ -184,7 +184,7 @@ namespace helpers { auto nSamplesPerBatch = nBatchIndex * numOfClassX * numOfSamples; auto nClassesPerSample = nSampleIndexInBatch * numOfClassX; - for (auto nClass = 0; nClass < numOfClassX; nClass += 1) { + for (Nd4jLong nClass = 0; nClass < numOfClassX; nClass += 1) { auto nIndex = nSamplesPerBatch + nClassesPerSample + nClass; auto unifornLog = nd4j::math::nd4j_log(-nd4j::math::nd4j_log(rng.relativeT(nIndex, minVal, maxVal))); Tx tValue = (xTad[nClass * xDimAstride] - unifornLog); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp b/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp index c39e28928..9fb2281b0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp @@ -50,7 +50,7 @@ namespace helpers { width = lastDim; } - for (int i = 0; i < input->lengthOf(); i += lastDim) { + for (Nd4jLong i = 0; i < input->lengthOf(); i += lastDim) { for (Nd4jLong k = startPos; k < width && pos < output->lengthOf(); k++) { output->p(pos++, input->e(i + k)); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp index 8bfc1ca1a..f61f1a1cf 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp @@ -110,7 +110,7 @@ namespace helpers { } else { std::vector dims(source->rankOf() - axe - 1); - for (int i = 0; i < dims.size(); ++i) + for (size_t i = 0; i < dims.size(); ++i) dims[i] = axe + 1 + i; ResultSet listOfTensors = source->allTensorsAlongDimension({dims}); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index 09a628b84..3f2c5d02f 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -55,9 +55,9 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop // loop through output array auto func = PRAGMA_THREADS_FOR_3D { - for (uint b = start_x; b < stop_x; b += inc_x) { - for (uint h = start_y; h < stop_y; h += inc_y) { - for (uint w = start_z; w < stop_z; w += inc_z) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto h = start_y; h < stop_y; h += inc_y) { + for (auto w = start_z; w < stop_z; w += inc_z) { for (uint c = 0; c < iC; ++c) { const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8]; const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8]; @@ -146,11 +146,11 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND std::vector 
temp(numOfSpatialDims + rank); - int i; + uint i; for(i = 0; i < numOfSpatialDims; ++i) temp[i] = blockShape.e(i); temp[i++] = output.sizeAt(0); - for(int j = 1; j < rank; ++i, ++j) + for(uint j = 1; j < rank; ++i, ++j) temp[i] = input.sizeAt(j); NDArray inputRearranged0 = input.reshape(input.ordering(), temp); @@ -163,7 +163,7 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND temp[2*i - 1] = numOfSpatialDims + i; temp[2*i] = i - 1; } - for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i) + for(i = 2 * numOfSpatialDims + 1; i < static_cast<uint>(temp.size()); ++i) temp[i] = i; inputRearranged0.permutei(temp); @@ -216,8 +216,8 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB // loop through output array auto func = PRAGMA_THREADS_FOR_2D { - for (uint b = start_x; b < stop_x; b += inc_x) { - for (uint h = start_y; h < stop_y; h += inc_y) { + for (auto b = start_x; b < stop_x; b += inc_x) { + for (auto h = start_y; h < stop_y; h += inc_y) { for (uint w = 0; w < oW; ++w) { for (uint c = 0; c < iC; ++c) { diff --git a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp index 1679557af..06833d6b3 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp @@ -87,7 +87,7 @@ namespace helpers { if (input->isVector()) { T val = input->e(0); - for (int e = 1; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // min val = nd4j::math::nd4j_min(val, input->t(e)); @@ -115,7 +115,7 @@ namespace helpers { for (Nd4jLong i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { - for (int e = 0; e < minT->lengthOf(); e++) { + for (Nd4jLong e = 0; e < minT->lengthOf(); e++) { minT->p(e, nd4j::math::nd4j_min(minT->e(e), listOfTensors.at(i)->e(e))); } } @@ -138,7 +138,7 @@ namespace helpers { T val = T(0.f); int count = 0; - for (int e = 0; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 0; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // mean val += input->e(e); @@ -166,7 +166,7 @@ namespace helpers { auto meanV = meanT->dup(); meanV.assign(listOfTensors.at(0)); - for (int i = 1; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 1; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { @@ -198,7 +198,7 @@ namespace helpers { if (input->isVector()) { T val = T(0.f); int count = 0; - for (int e = 0; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 0; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // sum val += input->t(e); @@ -220,7 +220,7 @@ namespace helpers { std::vector> outputs(numOfClasses); auto sumT = listOfOutTensors.at(idx); - for (int i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { @@ -248,7 +248,7 @@ namespace helpers { T val = input->e(0); int count = 0; - for (int e = 1; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { if (idx == indices->e(e)) { // sum val *= input->e(e); @@ -269,7 +269,7 @@ namespace helpers { int numOfClasses = output->sizeAt(0); // number of classes auto sumT = listOfOutTensors.at(idx); sumT->assign(listOfTensors.at(0)); - for (int i = 1; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
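// indices->lengthOf() returns Nd4jLong, so the int counters replaced throughout this
// file were not only a vectorization obstacle but a latent truncation bug: past
// 2^31 - 1 elements a 32-bit counter overflows before the bound is reached. A small
// self-contained illustration with a hypothetical length (not taken from the library):
//
//     #include <cstdint>
//     using Nd4jLong = int64_t;
//
//     Nd4jLong lengthOf() { return 3000000000LL; }      // > INT32_MAX
//
//     void walk() {
//         // for (int e = 0; e < lengthOf(); ++e) {}    // undefined: e overflows int
//         for (Nd4jLong e = 0; e < lengthOf(); ++e) {}  // well-defined for any length
//     }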
if (indices->e(i) == idx) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { @@ -313,7 +313,7 @@ namespace helpers { bool segmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output) { auto val = indices->e(0); - for (int e = 1; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 1; e < indices->lengthOf(); e++) { output = indices->e(e); if (val.e(0) > output.e(0)) return false; @@ -362,7 +362,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { T val = input->e(fi->second.at(0)); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) { val = nd4j::math::nd4j_max(val, input->e(fi->second.at(idx))); } output->p(fi->first, val); @@ -380,7 +380,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors.at(fi->first); outputT->assign(listOfTensors.at(fi->second.at(0))); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) { auto maxT = listOfTensors.at(fi->second.at(idx)); for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) { T val = nd4j::math::nd4j_max(maxT->e(e), outputT->e(e)); @@ -432,7 +432,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors.at(fi->first); outputT->assign(listOfTensors.at(fi->second.at(0))); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (size_t idx = 1; idx < fi->second.size(); ++idx) { auto minT = listOfTensors.at(fi->second.at(idx)); for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) { @@ -560,7 +560,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors.at(fi->first); outputT->assign(listOfTensors.at(fi->second.at(0))); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (size_t idx = 1; idx < fi->second.size(); ++idx) { auto current = listOfTensors.at(fi->second.at(idx)); *outputT *= *current; @@ -584,7 +584,7 @@ namespace helpers { if (input->isVector()) { // 1D case for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { double sumValue = input->e(fi->second.at(0)); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (size_t idx = 1; idx < fi->second.size(); ++idx) { sumValue += input->e(fi->second.at(idx)); } output->p(fi->first, sumValue / nd4j::math::nd4j_sqrt(fi->second.size())); @@ -599,7 +599,7 @@ namespace helpers { for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) { auto outputT = listOfOutTensors.at(fi->first); outputT->assign(listOfTensors.at(fi->second.at(0))); - for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) { + for (size_t idx = 1; idx < fi->second.size(); ++idx) { auto current = listOfTensors.at(fi->second.at(idx)); *outputT += *current; } @@ -651,7 +651,7 @@ namespace helpers { auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); - for (uint64_t e = 0; e < current->lengthOf(); e++) { + for (Nd4jLong e = 0; e < current->lengthOf(); e++) { if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) <= T(1.e-6)) currentOut->p(e, currentGradOut->e(e)); } @@ -703,7 +703,7 @@ namespace helpers { auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { + for (Nd4jLong e = 0; e < current->lengthOf(); e++) { if
(nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e(e) - current->e(e)) < 1.e-5) currentOut->p(e, currentGradOut->e(e)); @@ -746,13 +746,13 @@ namespace helpers { int pos = 0; //auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { + for (Nd4jLong e = 0; e < current->lengthOf(); e++) { currentOut->p(e, currentGradOut->e(e) / classCount.at(classNum)); } } @@ -781,7 +781,7 @@ namespace helpers { ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -817,7 +817,7 @@ namespace helpers { //std::vector> outputs(numOfClasses); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -860,7 +860,7 @@ namespace helpers { ResultSet listOfTensors = input->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); - for (int i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { Nd4jLong classNum = indices->e(i); NDArray* current = listOfTensors.at(i); NDArray* currentOut = listOfOutTensors.at(i); @@ -905,13 +905,13 @@ namespace helpers { ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); - for (int e = 0; e < current->lengthOf(); e++) { + for (Nd4jLong e = 0; e < current->lengthOf(); e++) { if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->t(e) - current->t(e)) < 1.e-6) currentOut->t(e) = currentGradOut->t(e); } @@ -955,7 +955,7 @@ namespace helpers { ResultSet listOfTensors = input->allTensorsAlongDimension(restDims); ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); - for (int i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { Nd4jLong classNum = indices->e(i); NDArray* current = listOfTensors.at(i); NDArray* currentOut = listOfOutTensors.at(i); @@ -984,7 +984,7 @@ namespace helpers { ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto currentOut = listOfOutTensors.at(i); auto currentGradOut = listOfGradOuts.at(classNum); @@ -1021,7 +1021,7 @@ namespace helpers { ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = 
listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); @@ -1053,7 +1053,7 @@ namespace helpers { // if input is a vector: (as if in doc sample) if (input->isVector()) { //auto func = PRAGMA_THREADS_FOR { - for (auto e = 0; e < indices->lengthOf(); e++) { + for (Nd4jLong e = 0; e < indices->lengthOf(); e++) { auto classNum = indices->e(e); output->p(e, gradOut->e(classNum) / nd4j::math::nd4j_sqrt(classCount[classNum])); } @@ -1069,7 +1069,7 @@ namespace helpers { ResultSet listOfOutTensors =output->allTensorsAlongDimension(restDims); //auto func = PRAGMA_THREADS_FOR { - for (auto i = 0; i < indices->lengthOf(); i++) { + for (Nd4jLong i = 0; i < indices->lengthOf(); i++) { auto classNum = indices->e(i); auto current = listOfTensors.at(i); auto currentOut = listOfOutTensors.at(i); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp index 05353bf5e..3c3db8139 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp @@ -378,7 +378,7 @@ namespace nd4j { int irow = 0; auto cShift = t * idxShift; - for (int e = 0; e < hsRounds; e++) { + for (Nd4jLong e = 0; e < hsRounds; e++) { irow = bIndices[e + cShift]; if (irow < 0 || irow >= vocabSize) continue; @@ -457,7 +457,7 @@ namespace nd4j { T sneu1[600]; T sneu1e[600]; - for (int e = start; e < stop; e++) { + for (auto e = start; e < stop; e++) { T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength]; T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength]; @@ -500,7 +500,7 @@ namespace nd4j { // hierarchic softmax step if (!indices.isEmpty()) { - for (int i = 0; i < numIndices; i++) { + for (Nd4jLong i = 0; i < numIndices; i++) { const int cIndex = bIndices[(e * numIndices) + i]; const int cCode = bCodes[(e * numIndices) + i]; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp index c8774f028..63c7758dc 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp @@ -41,8 +41,8 @@ namespace helpers { auto batchLoop = PRAGMA_THREADS_FOR { for (auto batch = start; batch < stop; batch++) { - for (auto r = 0; r < rows; r++) { - for (auto c = 0; c < r; c++) { + for (Nd4jLong r = 0; r < rows; r++) { + for (Nd4jLong c = 0; c < r; c++) { math::nd4j_swap(outputPart[batch]->t(r, c) , outputPart[batch]->t(c, r)); } } @@ -66,7 +66,7 @@ namespace helpers { auto permutationsPart = permutations.allTensorsAlongDimension({-1}); for (auto batch = 0; batch < permutationsPart.size(); ++batch) { - for (auto row = 0; row < PPart[batch]->rows(); ++row) { + for (Nd4jLong row = 0; row < PPart[batch]->rows(); ++row) { PPart[batch]->t(row, permutationsPart[batch]->t(row)) = T(1.f); } } @@ -77,7 +77,7 @@ namespace helpers { MmulHelper::matmul(&P, rightInput, &rightPermuted, 0, 0); ResultSet leftLowerPart = leftLower.allTensorsAlongDimension({-2, -1}); for (auto i = 0; i < leftLowerPart.size(); i++) { - for (auto r = 0; r < leftLowerPart[i]->rows(); r++) + for (Nd4jLong r = 0; r < leftLowerPart[i]->rows(); r++) leftLowerPart[i]->t(r,r) = (T)1.f; } // stage 2: triangularSolveFunctor for Lower with given b diff --git a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp index 5c9c2bbf7..b648c2b82 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp @@ -29,7 +29,7 @@ 
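// The hunks below take the complementary route: where a loop runs against an
// inherently unsigned extent such as outArrs.size(), the counter is switched to an
// unsigned type (uint or size_t) instead of being widened. The goal is the same in
// both directions -- the two sides of the loop condition end up the same type. A
// compact sketch of the convention (illustrative only):
//
//     #include <cstddef>
//     #include <vector>
//
//     void bump(std::vector<float>& v) {
//         for (size_t i = 0; i < v.size(); ++i) // size_t matches size():
//             v[i] += 1.0f;                     // no sign-compare warning and the
//     }                                         // trip count stays exactly v.size()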
namespace helpers { ////////////////////////////////////////////////////////////////////////// template <typename T> static void split_(const NDArray& input, const std::vector<NDArray*>& outArrs, const int axis) { - int numSplits = outArrs.size(); + uint numSplits = outArrs.size(); const auto sizeofT = input.sizeOfT(); @@ -73,9 +73,9 @@ namespace helpers { if (luckCase2) { - const uint xDim = input.sizeAt(axis); + const auto xDim = input.sizeAt(axis); - for (uint i = 0; i < input.lengthOf() / xDim; ++i) { + for (Nd4jLong i = 0; i < input.lengthOf() / xDim; ++i) { T* x = xBuff + xDim * i; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp index c4b45b398..e50b18cd6 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp @@ -39,7 +39,7 @@ namespace helpers { // } // ----------------------------------------------------------------------------------------------- // std::vector dimsToExclude(input->rankOf() - 1); - for (int d = 0; d < dimsToExclude.size(); ++d) + for (size_t d = 0; d < dimsToExclude.size(); ++d) dimsToExclude[d] = d; const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude); @@ -72,7 +72,7 @@ namespace helpers { NDArray topValues = NDArrayFactory::create('c', {k}); NDArray sortedVals = NDArrayFactory::create('c', {k}); NDArray topIndices = NDArrayFactory::create('c', {k}); - for (Nd4jLong pos = 0; pos < k; ++pos) { + for (uint pos = 0; pos < k; ++pos) { topIndices.t(pos) = pos; topValues.t(pos) = trial.t(pos); } @@ -80,7 +80,7 @@ namespace helpers { sortedVals.assign(topValues);// = NDArrayFactory::create('c', {k}); //std::sort(sortedVals.begin(), sortedVals.end()); // sorted in ascending order SpecialMethods::sortGeneric(sortedVals.buffer(), sortedVals.shapeInfo(), false); - for (int i = k; i < width; ++i) { + for (Nd4jLong i = static_cast<Nd4jLong>(k); i < width; ++i) { T val = trial.e(i); T minTopVal = sortedVals.t(0); if (minTopVal < val) { // value should be inserted to top k @@ -104,15 +104,15 @@ namespace helpers { if (needSort) { SpecialMethods::sortGeneric(topValues.buffer(), topValues.shapeInfo(), true); - for (int j = 0; j < width; j++) - for (int pos = 0; pos < k; ++pos) + for (Nd4jLong j = 0; j < width; j++) + for (uint pos = 0; pos < k; ++pos) if (topValues.t(pos) == trial.t(j)) topIndices.t(pos) = j; } else { // else sort by indices std::map sortValsMap; //std::vector> data(topValues.lengthOf()); - for (size_t e = 0; e < topValues.lengthOf(); ++e) { + for (Nd4jLong e = 0; e < topValues.lengthOf(); ++e) { sortValsMap[topIndices.t(e)] = topValues.t(e); } @@ -152,7 +152,7 @@ namespace helpers { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { bool found = false; - for (int j = 0; j < k; j++) { + for (uint j = 0; j < k; j++) { if (target->e(e) == indices->e(e * k + j)) { found = true; break; diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 1f630e8e0..f0b3a3a25 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -597,7 +597,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { zCoordStart[yRank - 1] = coordToRestore; // construct coordinates for x - for (uint j = 0; j < yLastDim; ++j) + for (int j = 0; j < yLastDim; ++j) xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride const
auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); @@ -628,7 +628,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con if (indices != nullptr) { - for(int i = 0; i < indices->lengthOf(); ++i) + for(Nd4jLong i = 0; i < indices->lengthOf(); ++i) if(indices->e(i) >= input->sizeAt(axis)) throw std::runtime_error("helpers::gather function: indices array contains wrong elements, each element must be smaller than corresponding dimension of input array !"); @@ -733,7 +733,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat // increasing counter to skip numIndices e++; std::vector indices; - for (; e < intArgs->size(); e++) + for (; e < static_cast(intArgs->size()); e++) indices.push_back((*intArgs)[e]); auto func = PRAGMA_THREADS_FOR { @@ -813,7 +813,7 @@ static void mergeMaxIndex_(const std::vector& inArrs, NDArray& output) T max = -DataTypeUtils::max(); Nd4jLong idx = 0; - for (int i = 0; i < numArgs; i++) { + for (Nd4jLong i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); if (v > max) { max = v; @@ -841,7 +841,7 @@ static void mergeMax_(const std::vector& inArrs, NDArray& output) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { T max = -DataTypeUtils::max(); - for (int i = 0; i < numArgs; i++) { + for (Nd4jLong i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); if (v > max) max = v; @@ -867,7 +867,7 @@ static void mergeAvg_(const std::vector& inArrs, NDArray& output) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { T sum = 0.; - for (int i = 0; i < numArgs; i++) { + for (Nd4jLong i = 0; i < numArgs; i++) { T v = inArrs[i]->e(e); sum += v; } @@ -893,7 +893,7 @@ static void mergeAdd_(const std::vector& inArrs, NDArray& output) { auto func = PRAGMA_THREADS_FOR { for (auto e = start; e < stop; e++) { T sum = (T) 0.f; - for (int i = 0; i < numArgs; i++) + for (Nd4jLong i = 0; i < numArgs; i++) sum += inArrs[i]->e(e); output.p(e, sum); @@ -1242,7 +1242,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c memset(gradIBuff, 0, gradILen * sizeof(T)); else { //PRAGMA_OMP_PARALLEL_FOR_SIMD - for (int i = 0; i < gradILen * gradIEWS; i += gradIEWS) + for (Nd4jLong i = 0; i < gradILen * gradIEWS; i += gradIEWS) gradIBuff[i] = static_cast(0.f); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp index c825a8fee..04508dcf8 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp @@ -43,10 +43,10 @@ namespace helpers { auto rows = leftInput->rows(); auto cols = rightInput->columns(); //output->t(0,0) = rightInput->t(0,0) / leftInput->t(0,0); - for (auto r = 0; r < rows; r++) { - for (auto j = 0; j < cols; j++) { + for (Nd4jLong r = 0; r < rows; r++) { + for (Nd4jLong j = 0; j < cols; j++) { auto sum = rightInput->t(r, j); - for (auto c = 0; c < r; c++) { + for (Nd4jLong c = 0; c < r; c++) { sum -= leftInput->t(r, c) * output->t(c, j); } output->t(r, j) = sum / leftInput->t(r, r); @@ -72,10 +72,10 @@ namespace helpers { static void upperTriangularSolve(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) { auto rows = leftInput->rows(); auto cols = rightInput->columns(); - for (auto r = rows; r > 0; r--) { - for (auto j = 0; j < cols; j++) { + for (Nd4jLong r = rows; r > 0; r--) { + for (Nd4jLong j = 0; j < cols; j++) { 
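// The downward row loop above ("r > 0", then indexing with r - 1) also shows why the
// patch standardizes on signed Nd4jLong rather than uint for these solvers: the
// natural countdown form of such a loop is an infinite-loop trap with an unsigned
// counter. A minimal sketch (backward() is illustrative, not the library's API):
//
//     #include <cstdint>
//     using Nd4jLong = int64_t;
//
//     void backward(const double* a, double* x, Nd4jLong n) {
//         for (Nd4jLong r = n - 1; r >= 0; --r) // signed: stops once r reaches -1
//             x[r] = a[r];                      // (an unsigned r would wrap past zero
//     }                                         //  and never terminate)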
auto sum = rightInput->t(r - 1, j); - for (auto c = r; c < rows; c++) { + for (Nd4jLong c = r; c < rows; c++) { sum -= leftInput->t(r - 1, c) * output->t(c, j); } output->t(r - 1, j) = sum / leftInput->t(r - 1, r - 1); @@ -114,14 +114,14 @@ namespace helpers { auto batchLoop = PRAGMA_THREADS_FOR { for (auto batch = start; batch < stop; batch++) { if (!lower) { - for (auto r = 0; r < rows; r++) { - for (auto c = 0; c <= r; c++) { + for (Nd4jLong r = 0; r < rows; r++) { + for (Nd4jLong c = 0; c <= r; c++) { outputPart[batch]->t(r, c) = inputPart[batch]->t(c, r); } } } else { - for (auto r = 0; r < rows; r++) { - for (auto c = r; c < cols; c++) { + for (Nd4jLong r = 0; r < rows; r++) { + for (Nd4jLong c = r; c < cols; c++) { outputPart[batch]->t(r, c) = inputPart[batch]->t(c, r); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp b/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp index 2d4bfee21..2dd936b09 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp @@ -26,7 +26,7 @@ namespace helpers { template static void adjustWeights_(NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) { - for (int e = 0; e < input->lengthOf(); e++) { + for (Nd4jLong e = 0; e < input->lengthOf(); e++) { int val = input->e(e); if (val < maxLength) { if (weights != nullptr)