Loops auto-vectorization problem fix (#274)

* libnd4j cast loop types Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j more type casting added to loops Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j sync casting types of iterated variable in loops Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j more loops reviewed for vectorization problem fix Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j fixed several typos Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j several more files reviewed to fix auto-vectorization problem in loops Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j merge master and reviewed more files to fix auto-vectorization problem in loops Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j several type casting added in broadcasting that were missed, fixed mac builds Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j double check all files and fix several more places in loops Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j fixed builds Signed-off-by: Oleg <oleg.semeniv@gmail.com>
* libnd4j revert changes for lup.cpp Signed-off-by: Oleg <oleg.semeniv@gmail.com>
2020-02-26 20:12:19 +02:00 · 2020-02-26 20:12:19 +02:00 · b4575d11e9
commit b4575d11e9
parent 5c806d2fb5
48 changed files with 1084 additions and 1084 deletions
--- a/libnd4j/include/helpers/Loops.h
+++ b/libnd4j/include/helpers/Loops.h
--- a/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
+++ b/libnd4j/include/loops/cpu/TrueBroadcastHelper.hpp
@ -50,12 +50,12 @@ namespace nd4j {
                1 == zArr.ews() && 'c' == zArr.ordering());
            if (bSpecialCase && yArr.isColumnVector() && 1 == xArr.sizeAt(-1) ) {
-                auto yLen = (uint32_t)yArr.lengthOf();
+                auto yLen = yArr.lengthOf();
                auto func = PRAGMA_THREADS_FOR{
-                   for (uint32_t i = start; i < stop; i++) {
+                   for (auto i = start; i < stop; i++) {
                       auto rZ = z + (i * yLen);
                       auto v = x[i];
-                       for (uint32_t j = 0; j < yLen; j++) {
+                       for (Nd4jLong j = 0; j < yLen; j++) {
                            rZ[j] = OpType::op(v, y[j]);
                       }
                   }
@ -74,13 +74,13 @@ namespace nd4j {
            if (bSpecialCase && bSpecialCase2) {
-                int zDim1 = zArr.sizeAt(-2);
+                uint32_t zDim1 = zArr.sizeAt(-2);
-                int zDim2 = zArr.sizeAt(-1);
+                uint32_t zDim2 = zArr.sizeAt(-1);
-                int nLen = zArr.lengthOf() / yArr.sizeAt(-1);
+                uint32_t nLen = zArr.lengthOf() / yArr.sizeAt(-1);
                auto func = PRAGMA_THREADS_FOR{
-                     for (uint32_t total = start; total < stop; total++) {
+                     for (auto total = start; total < stop; total++) {
                        uint32_t i = total / zDim1;
                        uint32_t j = total % zDim1;
@ -127,7 +127,7 @@ namespace nd4j {
                                  yCoords[iy--] = 0;
                              }
                          }
-                        }
+                    }
                        const auto xOffset = shape::getOffset(xShapeInfo, xCoords.data());
                        const auto yOffset = shape::getOffset(yShapeInfo, yCoords.data());
--- a/libnd4j/include/loops/cpu/broadcasting.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting.hpp
@ -184,7 +184,7 @@ namespace functions {
                        const auto oX = x[i];
                        PRAGMA_OMP_SIMD
-                        for (unsigned int f = 0; f < loopLength; f++)
+                        for (Nd4jLong f = 0; f < loopLength; f++)
                            oZ[f] = OpType::op(oX, oY[f]);
                    }
                } else if(kindOfLoop == nd4j::LoopKind::BROADCAST_SCALAR_Y){
@ -198,7 +198,7 @@ namespace functions {
                        const auto oY = y[i];
                        PRAGMA_OMP_SIMD
-                        for (unsigned int f = 0; f < loopLength; f++)
+                        for (Nd4jLong f = 0; f < loopLength; f++)
                            oZ[f] = OpType::op(oX[f], oY);
                    }
                }
@ -213,14 +213,14 @@ namespace functions {
                    Nd4jLong  yStrides[3] = { 0,0,0 };
                    nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides);
-                    uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1);
+                    uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1);
-                    uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2);
+                    uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2);
-                    for (uint32_t index0 = start; index0 < stop; index0++) {
+                    for (auto index0 = start; index0 < stop; index0++) {
                        PRAGMA_OMP_SIMD
-                            for (uint32_t index1 = 0; index1 < nSize1; index1++) {
+                            for (uint64_t index1 = 0; index1 < nSize1; index1++) {
-                                for (uint32_t index2 = 0; index2 < nSize2; index2++) {
+                                for (uint64_t index2 = 0; index2 < nSize2; index2++) {
                                    auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2);
                                    auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2);
                                    auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2);
@ -242,18 +242,18 @@ namespace functions {
                    Nd4jLong  yStrides[4] = { 0,0,0,0 };
                    nd4j::ShapeUtils::copyCertainStridesFromShapeInfo(yShapeInfo, xRank, dimensionLength, dimension, yStrides);
-                    uint32_t nSize1 = shape::sizeAt(zShapeInfo, 1);
+                    uint64_t nSize1 = shape::sizeAt(zShapeInfo, 1);
-                    uint32_t nSize2 = shape::sizeAt(zShapeInfo, 2);
+                    uint64_t nSize2 = shape::sizeAt(zShapeInfo, 2);
-                    uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3);
+                    uint64_t nSize3 = shape::sizeAt(zShapeInfo, 3);
-                    for (uint32_t i = start; i < stop; i++) {
+                    for (auto i = start; i < stop; i++) {
-                        uint32_t index0 = i / nSize1;
+                        uint64_t index0 = i / nSize1;
-                        uint32_t index1 = i % nSize1;
+                        uint64_t index1 = i % nSize1;
                        PRAGMA_OMP_SIMD
-                            for (uint32_t index2 = 0; index2 < nSize2; index2++) {
+                            for (uint64_t index2 = 0; index2 < nSize2; index2++) {
-                                for (uint32_t index3 = 0; index3 < nSize3; index3++) {
+                                for (uint64_t index3 = 0; index3 < nSize3; index3++) {
                                    auto rX = x + (xStrides[0] * index0 + xStrides[1] * index1 + xStrides[2] * index2 + xStrides[3] * index3);
                                    auto rY = y + (yStrides[0] * index0 + yStrides[1] * index1 + yStrides[2] * index2 + yStrides[3] * index3);
                                    auto rZ = z + (zStrides[0] * index0 + zStrides[1] * index1 + zStrides[2] * index2 + zStrides[3] * index3);
@ -279,7 +279,7 @@ namespace functions {
                    uint32_t nSize3 = shape::sizeAt(zShapeInfo, 3);
                    uint32_t nSize4 = shape::sizeAt(zShapeInfo, 4);
-                    for (uint32_t i = start; i < stop; i++) {
+                    for (auto i = start; i < stop; i++) {
                        uint32_t index0 = i / nSize1;
                        uint32_t index1 = i % nSize1;
@ -326,7 +326,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
                            oZ[zOffset] = OpType::op(oX[offset], y[offset]);
@ -344,7 +344,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            oZ[offset] = OpType::op(oX[offset], y[yOffset]);
@ -362,7 +362,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            oZ[offset] = OpType::op(oX[xOffset], y[offset]);
@ -382,7 +382,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
@ -497,7 +497,7 @@ namespace functions {
                    auto oY = y + tadOffsets[i];
                    PRAGMA_OMP_SIMD
-                    for (int f = 0; f < tadLength; f++) {
+                    for (unsigned int f = 0; f < tadLength; f++) {
                        auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                        auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
                        oZ[zOffset] = OpType::op(x[offset], oY[offset]);
@ -515,7 +515,7 @@ namespace functions {
                    auto oY = y + tadOffsets[i];
                    PRAGMA_OMP_SIMD
-                    for (int f = 0; f < tadLength; f++) {
+                    for (unsigned int f = 0; f < tadLength; f++) {
                        auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                        auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX);
                        oZ[offset] = OpType::op(x[xOffset], oY[offset]);
@ -533,7 +533,7 @@ namespace functions {
                    auto oY = y + tadOffsets[i];
                    PRAGMA_OMP_SIMD
-                    for (int f = 0; f < tadLength; f++) {
+                    for (unsigned int f = 0; f < tadLength; f++) {
                        auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                        auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
                        oZ[offset] = OpType::op(x[offset], oY[yOffset]);
@ -553,7 +553,7 @@ namespace functions {
                    auto oY = y + tadOffsets[i];
                    PRAGMA_OMP_SIMD
-                    for (int f = 0; f < tadLength; f++) {
+                    for (unsigned int f = 0; f < tadLength; f++) {
                        auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
                        auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                        auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
--- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp
+++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp
@ -183,7 +183,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            oZ[offset] = OpType::op(oX[offset], y[offset], extraParams);
                        }
@ -200,7 +200,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
                            oZ[zOffset] = OpType::op(oX[offset], y[offset], extraParams);
@ -218,7 +218,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            oZ[offset] = OpType::op(oX[offset], y[yOffset], extraParams);
@ -237,7 +237,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            oZ[offset] = OpType::op(oX[xOffset], y[offset], extraParams);
@ -257,7 +257,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
@ -357,7 +357,7 @@ namespace functions {
                        auto oZ = z + zTadOffset[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            oZ[offset] = OpType::op(x[offset], oY[offset], extraParams);
                        }
@ -375,7 +375,7 @@ namespace functions {
                        auto oY = y + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
                            oZ[zOffset] = OpType::op(x[offset], oY[offset], extraParams);
@ -394,7 +394,7 @@ namespace functions {
                        auto oY = y + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
                            oZ[offset] = OpType::op(x[xOffset], oY[offset], extraParams);
@ -413,7 +413,7 @@ namespace functions {
                        auto oY = y + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
                            oZ[offset] = OpType::op(x[offset], oY[yOffset], extraParams);
@ -434,7 +434,7 @@ namespace functions {
                        auto oY = y + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
                            auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
--- a/libnd4j/include/loops/cpu/broadcasting_int.cpp
+++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp
@ -177,7 +177,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            oZ[offset] = OpType::op(oX[offset], y[offset]);
                        }
@ -194,7 +194,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
                            oZ[zOffset] = OpType::op(oX[offset], y[offset]);
@ -212,7 +212,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            oZ[offset] = OpType::op(oX[offset], y[yOffset]);
@ -230,7 +230,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            oZ[offset] = OpType::op(oX[xOffset], y[offset]);
@ -250,7 +250,7 @@ namespace functions {
                        auto oX = x + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (unsigned int f = 0; f < tadLength; f++) {
                            auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX);
                            auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
@ -347,7 +347,7 @@ namespace functions {
                        auto oZ = z + zTadOffset[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (uint f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            oZ[offset] = OpType::op(x[offset], oY[offset]);
                        }
@ -364,7 +364,7 @@ namespace functions {
                        auto oZ = z + zTadOffset[i];
                        auto oY = y + tadOffsets[i];
-                        for (int f = 0; f < tadLength; f++) {
+                        for (uint f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
                            oZ[zOffset] = OpType::op(x[offset], oY[offset]);
@ -382,7 +382,7 @@ namespace functions {
                        auto oY = y + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (uint f = 0; f < tadLength; f++) {
                            auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
                            oZ[offset] = OpType::op(x[xOffset], oY[offset]);
@ -400,7 +400,7 @@ namespace functions {
                        auto oY = y + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (uint f = 0; f < tadLength; f++) {
                            auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
                            oZ[offset] = OpType::op(x[offset], oY[yOffset]);
@ -420,7 +420,7 @@ namespace functions {
                        auto oY = y + tadOffsets[i];
                        PRAGMA_OMP_SIMD
-                        for (int f = 0; f < tadLength; f++) {
+                        for (uint f = 0; f < tadLength; f++) {
                            auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX);
                            auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY);
                            auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ);
--- a/libnd4j/include/loops/cpu/indexreduce.hpp
+++ b/libnd4j/include/loops/cpu/indexreduce.hpp
@ -124,7 +124,7 @@ void IndexReduce<X, Z>::exec(void *vx, Nd4jLong *xShapeInfo,
            return;
        const auto indexValue = OpType::startingIndexValue(x);
-        for (uint i = 0; i < zLen; i++)
+        for (Nd4jLong i = 0; i < zLen; i++)
            z[i] = (Z) indexValue.index;
        return;
--- a/libnd4j/include/loops/cpu/random.hpp
+++ b/libnd4j/include/loops/cpu/random.hpp
@ -93,7 +93,7 @@ namespace functions {
                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i++)  {
+                    for (auto i = start; i < stop; i++)  {
                        auto offset  = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
                        z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments);
@ -111,7 +111,7 @@ namespace functions {
                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i++)  {
+                    for (auto i = start; i < stop; i++)  {
                        auto offset  = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
                        z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments);
@ -129,7 +129,7 @@ namespace functions {
                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i++)  {
+                    for (auto i = start; i < stop; i++)  {
                        auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto offset  = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
                        z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments);
@ -149,7 +149,7 @@ namespace functions {
                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i++)  {
+                    for (auto i = start; i < stop; i++)  {
                        auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY);
                        auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
@ -197,7 +197,7 @@ namespace functions {
                else{
                    auto func = PRAGMA_THREADS_FOR {
                        PRAGMA_OMP_SIMD
-                        for (uint64_t i = start; i < stop; i++)  {
+                        for (auto i = start; i < stop; i++)  {
                            auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                            z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments);
                        }
@ -213,7 +213,7 @@ namespace functions {
                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i++)  {
+                    for (auto i = start; i < stop; i++)  {
                        auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX);
                        auto zOffset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
                        z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments);
@ -255,7 +255,7 @@ namespace functions {
                auto func = PRAGMA_THREADS_FOR {
                    PRAGMA_OMP_SIMD
-                    for (uint64_t i = start; i < stop; i++)  {
+                    for (auto i = start; i < stop; i++)  {
                        auto offset = shape::indexOffset(i, zShapeInfo, zShapeInfoCast, canCastZ);
                        z[offset] = OpClass::op(i, length, rng, extraArguments);
                    }
--- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp
+++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp
@ -55,7 +55,7 @@ namespace functions {
                    return;
                const auto startingVal = OpType::startingValue(x);
-                for (uint i = 0; i < length; i++)
+                for (Nd4jLong i = 0; i < length; i++)
                    z[i] = startingVal;
                return;
            }
@ -68,7 +68,7 @@ namespace functions {
                uint xShapeInfoCast[MAX_RANK];
                const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
-                for (auto i = 0; i < length; i++)
+                for (Nd4jLong i = 0; i < length; i++)
                    startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
                z[0] = OpType::postProcess(startingValue, length, extraParams);
@ -94,7 +94,7 @@ namespace functions {
                    uint xShapeInfoCast[MAX_RANK];
                    bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
-                    for (auto i = 0; i < length; i++)
+                    for (Nd4jLong i = 0; i < length; i++)
                        startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
                    return OpType::postProcess(startingValue, length, extraParams);
@ -156,7 +156,7 @@ namespace functions {
                        return;
                    const auto startingVal = OpType::startingValue(x);
-                    for (uint i = 0; i < resultLength; i++)
+                    for (Nd4jLong i = 0; i < resultLength; i++)
                        z[i] = startingVal;
                    return;
                }
--- a/libnd4j/include/loops/cpu/reduce/reduce_float.hpp
+++ b/libnd4j/include/loops/cpu/reduce/reduce_float.hpp
@ -59,7 +59,7 @@ namespace functions {
                    return;
                const auto startingVal = OpType::startingValue(x);
-                for (uint i = 0; i < length; i++)
+                for (Nd4jLong i = 0; i < length; i++)
                    z[i] = startingVal;
                return;
@ -113,7 +113,7 @@ namespace functions {
                    uint xShapeInfoCast[MAX_RANK];
                    bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
-                    for (auto i = 0; i < length; i++)
+                    for (Nd4jLong i = 0; i < length; i++)
                        startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
                    return OpType::postProcess(startingValue, length, extraParams);
@ -184,7 +184,7 @@ namespace functions {
                        return;
                    const auto startingVal = std::is_same<OpType, simdOps::Mean<X,Z>>::value ? nd4j::DataTypeUtils::nanOrZero<Z>() : static_cast<Z>(OpType::startingValue(x));
-                    for (uint i = 0; i < resultLength; i++)
+                    for (Nd4jLong i = 0; i < resultLength; i++)
                        z[i] = startingVal;
                    return;
                }
--- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp
+++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp
@ -55,7 +55,7 @@ namespace functions {
                    return;
                const auto startingVal = OpType::startingValue(x);
-                for (uint i = 0; i < length; i++)
+                for (Nd4jLong i = 0; i < length; i++)
                    z[i] = startingVal;
                return;
            }
@ -110,7 +110,7 @@ namespace functions {
                    uint xShapeInfoCast[MAX_RANK];
                    bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
-                    for (auto i = 0; i < length; i++)
+                    for (Nd4jLong i = 0; i < length; i++)
                        startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
                    return OpType::postProcess(startingValue, length, extraParams);
@ -173,7 +173,7 @@ namespace functions {
                        return;
                    const auto startingVal = OpType::startingValue(x);
-                    for (uint i = 0; i < resultLength; i++)
+                    for (Nd4jLong i = 0; i < resultLength; i++)
                        z[i] = startingVal;
                    return;
                }
--- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp
+++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp
@ -57,7 +57,7 @@ namespace functions {
                    return;
                const auto startingVal = OpType::startingValue(x);
-                for (uint i = 0; i < length; i++)
+                for (Nd4jLong i = 0; i < length; i++)
                    z[i] = startingVal;
                return;
            }
@ -111,7 +111,7 @@ namespace functions {
                    uint xShapeInfoCast[MAX_RANK];
                    bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast);
-                    for (auto i = 0; i < length; i++)
+                    for (Nd4jLong i = 0; i < length; i++)
                        startingValue = OpType::update(startingValue, OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams);
                    return OpType::postProcess(startingValue, length, extraParams);
@ -182,7 +182,7 @@ namespace functions {
                        return;
                    const auto startingVal = OpType::startingValue(x);
-                    for (uint i = 0; i < zLength; i++)
+                    for (Nd4jLong i = 0; i < zLength; i++)
                        z[i] = startingVal;
                    return;
                }
--- a/libnd4j/include/loops/cpu/reduce3.hpp
+++ b/libnd4j/include/loops/cpu/reduce3.hpp
@ -53,7 +53,7 @@ void Reduce3<X,Z>::execScalar(void *vx, Nd4jLong *xShapeInfo,
            return;
        const auto startingVal = OpType::startingValue(x);
-        for (uint i = 0; i < length; i++)
+        for (Nd4jLong i = 0; i < length; i++)
            z[i] = startingVal;
        return;
--- a/libnd4j/include/loops/cpu/scalar.hpp
+++ b/libnd4j/include/loops/cpu/scalar.hpp
@ -73,7 +73,7 @@ void ScalarTransform<X, Y, Z>::transform(void *vx, Nd4jLong *xShapeInfo,
            auto oX = x + xTadOffsets[r];
            PRAGMA_OMP_SIMD
-            for (unsigned int f = 0; f < tadLength; f++)
+            for (int f = 0; f < tadLength; f++)
                oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
        };
    }
@ -83,7 +83,7 @@ void ScalarTransform<X, Y, Z>::transform(void *vx, Nd4jLong *xShapeInfo,
            auto oX = x + xTadOffsets[r];
            PRAGMA_OMP_SIMD
-            for (unsigned int f = 0; f < tadLength; f++)
+            for (int f = 0; f < tadLength; f++)
                oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
        };
    }
--- a/libnd4j/include/loops/cpu/scalar_bool.cpp
+++ b/libnd4j/include/loops/cpu/scalar_bool.cpp
@ -74,7 +74,7 @@ namespace functions {
                    auto oX = x + xTadOffsets[r];
                    PRAGMA_OMP_SIMD
-                    for (unsigned int f = 0; f < tadLength; f++)
+                    for (int f = 0; f < tadLength; f++)
                        oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
                };
            }
@ -84,7 +84,7 @@ namespace functions {
                    auto oX = x + xTadOffsets[r];
                    PRAGMA_OMP_SIMD
-                    for (unsigned int f = 0; f < tadLength; f++)
+                    for (int f = 0; f < tadLength; f++)
                        oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
                };
            }
--- a/libnd4j/include/loops/cpu/scalar_int.cpp
+++ b/libnd4j/include/loops/cpu/scalar_int.cpp
@ -74,7 +74,7 @@ namespace functions {
                    auto oX = x + xTadOffsets[r];
                    PRAGMA_OMP_SIMD
-                    for (unsigned int f = 0; f < tadLength; f++)
+                    for (int f = 0; f < tadLength; f++)
                        oZ[f] = OpType::op(oX[f], scalars[r], extraParams);
                };
            }
@ -84,7 +84,7 @@ namespace functions {
                    auto oX = x + xTadOffsets[r];
                    PRAGMA_OMP_SIMD
-                    for (unsigned int f = 0; f < tadLength; f++)
+                    for (int f = 0; f < tadLength; f++)
                        oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams);
                };
            }
--- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp
+++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp
@ -91,7 +91,7 @@ namespace functions {
            uint xShapeInfoCast[MAX_RANK];
            const bool canCast = nd4j::DataTypeUtils::castShapeInfo<uint>(xShapeInfo, xShapeInfoCast);
-            for (uint64_t i = 0; i < length; i++) {
+            for (Nd4jLong i = 0; i < length; i++) {
                auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast);
                SummaryStatsData<X> curr;
@ -116,7 +116,7 @@ namespace functions {
            auto x = reinterpret_cast<X *>(vx);
            auto z = reinterpret_cast<Z *>(vz);
            auto extraParams = reinterpret_cast<Z *>(vextraParams);
-            int resultLength = shape::length(zShapeInfo);
+            auto resultLength = shape::length(zShapeInfo);
            if(nd4j::ArrayOptions::arrayType(xShapeInfo) == nd4j::ArrayType::EMPTY) {
               if(nd4j::ArrayOptions::arrayType(zShapeInfo) == nd4j::ArrayType::EMPTY)
@ -124,7 +124,7 @@ namespace functions {
                SummaryStatsData<X> comp;
                comp.initWithValue(x[0]);
-                for (uint i = 0; i < resultLength; i++)
+                for (Nd4jLong i = 0; i < resultLength; i++)
                    z[i] = OpType::getValue(biasCorrected, comp);
                return;
            }
@ -166,14 +166,14 @@ namespace functions {
                    comp.initWithValue(tx[0]);
                    if (tadEWS == 1 && tadOrder == 'c') {
-                        for (int i = 1; i < tadLength; i++) {
+                        for (Nd4jLong i = 1; i < tadLength; i++) {
                            SummaryStatsData <X> indexVal2;
                            indexVal2.initWithValue(tx[i]);
                            comp = update(comp, OpType::op(indexVal2, extraParams), extraParams);
                        }
                    } else {
-                        for (int i = 1; i < tadLength; i++) {
+                        for (Nd4jLong i = 1; i < tadLength; i++) {
                            auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast);
                            SummaryStatsData <X> indexVal2;
--- a/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/generic/nn/batchnorm.cpp
@ -61,7 +61,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) {
    else
        axes.push_back(inRank-1);               // default dimension to reduce along is last dimension
-    const int numOfAxes = axes.size();
+    const uint numOfAxes = axes.size();
    REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
    // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
@ -83,7 +83,7 @@ CUSTOM_OP_IMPL(batchnorm, 3, 1, false, 1, 2) {
        REQUIRE_TRUE(beta->isSameShape(expShape), 0, "BATCHNORM op: wrong shape of beta array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(expShape).c_str(), ShapeUtils::shapeAsString(beta).c_str());
    // types of all input arrays should be the same
-    for(int i = 1; i < block.width(); ++i)
+    for(unsigned long i = 1; i < block.width(); ++i)
        REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM op: types of all input arrays should be the same !");
    nd4j_debug("MKL-DNN is not used for batchnorm!\n", 0);
@ -167,7 +167,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
    else
        axes.push_back(inRank-1);               // default dimension to reduce along is last dimension
-    const int numOfAxes = axes.size();
+    const uint numOfAxes = axes.size();
    REQUIRE_TRUE(numOfAxes <= inRank, 0, "BATCHNORM_BP op: too big number of input axes to normalize over, expected number should be less or equal to rank of input array, but got %i and %i correspondingly !", numOfAxes, inRank);
    // evaluate expected shape for mean, variance and gamma. These 3 arrays should have identical shapes
@ -191,7 +191,7 @@ CUSTOM_OP_IMPL(batchnorm_bp, 4, 3, false, 1, 2) {
    REQUIRE_TRUE(input->isSameShape(dLdO), 0, "BATCHNORM_BP op: wrong shape of output gradients array, expected is %s, but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(dLdO).c_str());
    // types of all input arrays should be the same (except dLdO)
-    for(int i = 1; i < block.width() - 2; ++i)
+    for(unsigned long i = 1; i < block.width() - 2; ++i)
        REQUIRE_TRUE(INPUT_VARIABLE(0)->dataType() == INPUT_VARIABLE(i)->dataType(), 0, "BATCHNORM_BP op: types of arrays (input, mean, variance, gamma, beta) should be the same !");
    // ***** calculations ***** //
--- a/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/BarnesHutTsne.cpp
@ -30,7 +30,7 @@ namespace helpers {
        int* pRowCounts = reinterpret_cast<int*>(rowCounts.buffer());
        int const* pRows = reinterpret_cast<int const*>(rowP->getBuffer());
        int const* pCols = reinterpret_cast<int const*>(colP->getBuffer());
-        for (int n = 0; n < N; n++) {
+        for (Nd4jLong n = 0; n < N; n++) {
            int begin = pRows[n];//->e<int>(n);
            int end = pRows[n + 1];//rowP->e<int>(n + 1);
            for (int i = begin; i < end; i++) {
@ -72,7 +72,7 @@ namespace helpers {
        int const* pRows = reinterpret_cast<int const*>(rowP->getBuffer());
        int* symRowP = reinterpret_cast<int*>(outputRows->buffer());
        symRowP[0] = 0;
-        for (int n = 0; n < N; n++)
+        for (Nd4jLong n = 0; n < N; n++)
            symRowP[n + 1] = symRowP[n] + rowCounts->e<int>(n);
 //        outputRows->printBuffer("output rows");
@ -86,7 +86,7 @@ namespace helpers {
        std::vector<int> offset(N);// = NDArrayFactory::create<int>('c', {N});
 //PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(schedule(guided) shared(offset))
-        for (int n = 0; n < N; n++) {
+        for (Nd4jLong n = 0; n < N; n++) {
            int begin = pRows[n];
            int bound = pRows[n + 1];
--- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp
@ -146,17 +146,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr
        auto length = shape::length(inShapeInfo);
        if (inEWS == 1) {
-            for (int i = 0; i < length; i++)
+            for (Nd4jLong i = 0; i < length; i++)
                max = nd4j::math::nd4j_max<T>(max, inBuff[i]);
            PRAGMA_OMP_SIMD_SUM(sum)
-            for (int i = 0; i < length; i++) {
+            for (Nd4jLong i = 0; i < length; i++) {
                outBuff[i] = nd4j::math::nd4j_exp<T,T>(inBuff[i] - max);
                sum += outBuff[i];
            }
            PRAGMA_OMP_SIMD
-            for (int i = 0; i < length; i++) {
+            for (Nd4jLong i = 0; i < length; i++) {
                outBuff[i] /= sum;
                outBuff[i] = nd4j::math::nd4j_log<T,T>(outBuff[i]);
            }
@ -164,17 +164,17 @@ void softMaxForVector(nd4j::LaunchContext * context, const NDArray& input, NDArr
        else if (inEWS > 1) {
            PRAGMA_OMP_SIMD_MAX(max)
-            for (int i = 0; i < length; i++)
+            for (Nd4jLong i = 0; i < length; i++)
                max = nd4j::math::nd4j_max<T>(max, inBuff[i * inEWS]);
            PRAGMA_OMP_SIMD_SUM(sum)
-            for (int i = 0; i < length; i++) {
+            for (Nd4jLong i = 0; i < length; i++) {
                outBuff[i * inEWS] = nd4j::math::nd4j_exp<T,T>(inBuff[i * inEWS] - max);
                sum += outBuff[i * inEWS];
            }
            PRAGMA_OMP_SIMD
-            for (int i = 0; i < length; i++) {
+            for (Nd4jLong i = 0; i < length; i++) {
                outBuff[i * inEWS] /= sum;
                outBuff[i * inEWS] = nd4j::math::nd4j_log<T, T>(outBuff[i * inEWS]);
            }
--- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp
@ -443,7 +443,7 @@ namespace nd4j {
 					const X* bias_new;
 					X* bias_extra = nullptr;
 					size_t total_num = 1;
-					for (size_t i = 0; i < rank; i++) {
+					for (Nd4jLong i = 0; i < rank; i++) {
 						total_num *= bases[i];
 					}
 					Nd4jLong inc;
@ -574,7 +574,7 @@ namespace nd4j {
 					for (size_t i = 0; i < 2; i++) {
 						numNC *= bases[i];
 					}
-					for (size_t i = 2; i < rank; i++) {
+					for (Nd4jLong i = 2; i < rank; i++) {
 						numHW *= bases[i];
 					}
 					Nd4jLong total_num = numNC * numHW;
--- a/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/axis.cpp
@ -27,7 +27,7 @@ namespace helpers {
    void adjustAxis(Nd4jLong rank, NDArray* axisVector, std::vector<int>& output) {
        output.resize(axisVector->lengthOf());
-        for (int e = 0; e < axisVector->lengthOf(); e++) {
+        for (Nd4jLong e = 0; e < axisVector->lengthOf(); e++) {
                auto ca = axisVector->e<int>(e);
                if (ca < 0)
                    ca += rank;
@ -37,7 +37,7 @@ namespace helpers {
    }
    void adjustAxis(Nd4jLong rank, std::vector<int> &axisVector) {
-        for (int e = 0; e < axisVector.size(); e++) {
+        for (size_t e = 0; e < axisVector.size(); e++) {
            auto a = axisVector[e];
            if (a < 0)
                axisVector[e] = a + rank;
--- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
@ -66,7 +66,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
        Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps];
        Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()];
-        for (int j = 0; j < lenSmall; ++j) {
+        for (Nd4jLong j = 0; j < lenSmall; ++j) {
            const bool isOwner = (j < info._numThreads) ? thread_id == j : thread_id == (j % info._numThreads);
@ -96,7 +96,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
                shape::outerArrayOffsets(zOffsets, j, output->getShapeInfo(), mean->getShapeInfo(), auxBuff, dimsToExclude.data());
            PRAGMA_OMP_SIMD
-            for (uint i = 0; i < steps; ++i)
+            for (Nd4jLong i = 0; i < steps; ++i)
                z[zOffsets[i]] = (x[xOffsets[i]] - meanVal) * sigmaInvGam + betaVal;
        }
--- a/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/col2im.cpp
@ -65,8 +65,8 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input,  NDArray& outp
            T *col, *im;
            int imRow, imCol;
-            for (uint b = start_x; b < stop_x; b += inc_x) {
+            for (auto b = start_x; b < stop_x; b += inc_x) {
-                for (uint c = start_y; c < stop_y; c += inc_y) {
+                for (auto c = start_y; c < stop_y; c += inc_y) {
                    for (int kRow = 0; kRow < kH; ++kRow) {
                        for (int kCol = 0; kCol < kW; ++kCol) {
                            for (int colH = 0; colH < oH; ++colH) {
@ -96,7 +96,7 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input,  NDArray& outp
        auto func = PRAGMA_THREADS_FOR {
            T *col, *im;
-            for (uint b = start; b < stop; b++) {
+            for (auto b = start; b < stop; b++) {
                T *im0 = imBuff + b * imStride0;
                T *col4 = colBuff + b * colStride0;
                for (int colH = 0; colH < oH; ++colH, col4 += colStride4) {
--- a/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dilation2d.cpp
@ -55,8 +55,8 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
    auto func = PRAGMA_THREADS_FOR_2D {
-        for (uint b = start_x; b < stop_x; b += inc_x) {
+        for (auto b = start_x; b < stop_x; b += inc_x) {
-            for (uint oh = start_y; oh < stop_y; oh += inc_y) {
+            for (auto oh = start_y; oh < stop_y; oh += inc_y) {
                for (uint ow = 0; ow < oW; ++ow) {
                    for (uint c = 0; c < iC; ++c) {
@ -70,7 +70,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
                                const int iw = ow * sW - pW + kw * dW;
                                if (iw < 0 || iw >= iW) continue;
-                                uint xCoords[4] = {b,  (uint)ih, (uint)iw, c};
+                                uint xCoords[4] = { static_cast<uint>(b),  static_cast<uint>(ih), static_cast<uint>(iw), c};
                                uint yCoords[3] = {kh, kw, c};
                                const X val = x[shape::getOffset(xShapeInfo, xCoords)] + y[shape::getOffset(yShapeInfo, yCoords)];
@ -79,7 +79,7 @@ static void dilation2d_(NDArray *input, NDArray *weights, NDArray *output, const
                            }
                        }
-                        uint zCoords[4] = {b,  oh, ow, c};
+                        uint zCoords[4] = { static_cast<uint>(b),  static_cast<uint>(oh), ow, c};
                        z[shape::getOffset(zShapeInfo, zCoords)] = static_cast<Z>(max);
                    }
                }
--- a/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dropout.cpp
@ -63,7 +63,7 @@ namespace helpers {
            std::vector<Nd4jLong> dims(reduceShape->lengthOf());
            bool fit = true;
-            for( int i = 0; i < dims.size(); i++ ) {
+            for(auto i = 0; i < dims.size(); i++ ) {
                if (fit) {
                    dims[i] = reduceShape->e<Nd4jLong>(i);
                    for (int e = 0; e < input->rankOf(); ++e)
--- a/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/dynamic.cpp
@ -53,7 +53,7 @@ namespace nd4j {
                        outputs[i].second = 0;
                        //PRAGMA_OMP_PARALLEL_FOR_IF(indices->lengthOf() > Environment::getInstance()->elementwiseThreshold())
-                        for (int e = 0; e < indices->lengthOf(); ++e)
+                        for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
                            if ((*indices).e<Nd4jLong>(e) == i)
                                listOutForCurrent.at(outputs[i].second++)->assign(listOfTensors.at(e));
                    }
@ -65,7 +65,7 @@ namespace nd4j {
                        for (auto i = start; i < stop; i++) {
                            outputs[i].first = outputList[i];
                            outputs[i].second = 0;
-                            for (int e = 0; e < indices->lengthOf(); ++e)
+                            for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
                                if (indices->e<Nd4jLong>(e) == i)
                                    outputs[i].first->p(outputs[i].second++, input->e<T>(e));
                        }
@ -83,7 +83,7 @@ namespace nd4j {
                    for (int e = 0; e < numOfData; e++) {
                        auto data = inputs[e];
                        auto index = indices[e];
-                        for (int i = 0; i < index->lengthOf(); i++) {
+                        for (Nd4jLong i = 0; i < index->lengthOf(); i++) {
                            Nd4jLong pos = index->e<Nd4jLong>(i);
                            if (pos < 0) {
                                nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos);
@ -100,7 +100,7 @@ namespace nd4j {
                }
                else {
                    std::vector<int> restDims(output->rankOf() - 1);
-                    for (int i = restDims.size(); i > 0;  i--)
+                    for (auto i = restDims.size(); i > 0;  i--)
                        restDims[restDims.size() - i] = output->rankOf() - i;
                    ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
@ -109,12 +109,12 @@ namespace nd4j {
                        auto data = inputs[e];
                        auto index = indices[e];
                        std::vector<int> sourceDims(data->rankOf() - index->rankOf());
-                        for (int i = sourceDims.size(); i > 0;  i--)
+                        for (auto i = sourceDims.size(); i > 0;  i--)
                            sourceDims[sourceDims.size() - i] = data->rankOf() - i;
                        ResultSet listOfTensors = data->allTensorsAlongDimension(sourceDims)    ;
-                        for (int i = 0; i < index->lengthOf(); i++) {
+                        for (Nd4jLong i = 0; i < index->lengthOf(); i++) {
                            auto pos = index->e<Nd4jLong>(i);
                            if (pos < 0) {
                                nd4j_printf("dynamic_stitch: Index value should be non-negative. But %i was given", pos);
@ -146,7 +146,7 @@ namespace nd4j {
                    ResultSet listOfTensors = outputList[0]->allTensorsAlongDimension(sourceDims);
-                    for (unsigned int i = 0; i < inputGradientList.size(); i++) {
+                    for (auto i = 0; i < inputGradientList.size(); i++) {
                        outputs[i].first = inputGradientList[i];
                        if (outputs[i].first->rankOf() < 1) continue; // skip empty gradient outs
                        std::vector<int> outDims(outputs[i].first->rankOf() - 1);
@ -158,7 +158,7 @@ namespace nd4j {
                        outputs[i].second = 0;
-                        for (int e = 0; e < indices->lengthOf(); ++e)
+                        for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
                            if (indices->e<Nd4jLong>(e) == i)
                                listOfTensors.at(e)->assign(listOutForCurrent.at(outputs[i].second++));
                    }
@ -171,7 +171,7 @@ namespace nd4j {
                        for (auto i = start; i < stop; i++) {
                            outputs[i].first = inputGradientList[i];
                            outputs[i].second = 0;
-                            for (int e = 0; e < indices->lengthOf(); ++e)
+                            for (Nd4jLong e = 0; e < indices->lengthOf(); ++e)
                                if (indices->e<Nd4jLong>(e) == i)
                                    output->p<T>(e, outputs[i].first->e<T>(outputs[i].second++));
                        }
--- a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp
@ -45,7 +45,7 @@ namespace nd4j {
                    auto xShapeInfo = inputs[e]->shapeInfo();
                    auto xLength = inputs[e]->lengthOf();
-                    for (uint i = 0; i < xLength; i++)
+                    for (Nd4jLong i = 0; i < xLength; i++)
                        z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)];
                }
            }
--- a/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/hashcode.cpp
@ -26,7 +26,7 @@ namespace nd4j {
        namespace helpers {
            template <typename T>
            static void hashCode_(LaunchContext *context, NDArray &array, NDArray &result) {
-                auto blockSize = 32;
+                Nd4jLong blockSize = 32;
                auto length = array.lengthOf();
                int numBlocks = length / blockSize + ((length % blockSize == 0) ? 0 : 1);
                auto tempA = NDArrayFactory::create<Nd4jLong>('c', {numBlocks}, context);
@ -42,11 +42,11 @@ namespace nd4j {
                // we divide array into 32 element chunks, and store intermediate results once
                auto func = PRAGMA_THREADS_FOR {
-                    for (auto b = 0; b < stop; b++) {
+                    for (auto b = start; b < stop; b++) {
                        auto blockBuffer = buffer + b * numBlocks;
                        Nd4jLong r = 1;
-                        for (int e = 0; e < blockSize && e + (b * numBlocks) < length; e++) {
+                        for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < length; e++) {
                            auto v = longBytes<T>(blockBuffer[e]);
                            r = 31 * r + v;
                        }
@ -68,7 +68,7 @@ namespace nd4j {
                            auto blockBuffer = tempBuffer + b * numBlocks;
                            Nd4jLong r = 1;
-                            for (int e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) {
+                            for (Nd4jLong e = 0; e < blockSize && e + (b * numBlocks) < lastLength; e++) {
                                auto v = longBytes<T>(blockBuffer[e]);
                                r = 31 * r + v;
                            }
@ -103,4 +103,3 @@ namespace nd4j {
        }
    }
 }
--- a/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/histogram.cpp
@ -49,7 +49,7 @@ namespace nd4j {
                    }
                    PRAGMA_OMP_SIMD
-                    for (int x = 0; x < numBins; x++) {
+                    for (Nd4jLong x = 0; x < numBins; x++) {
                        result[x] += bins[x];
                    }
--- a/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/im2col.cpp
@ -64,8 +64,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input,  NDArra
    if (shape::order(imShapeBuffer) == 'c' &&  shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) {
        auto func = PRAGMA_THREADS_FOR_2D {
-            for (int b = start_x; b < stop_x; b++) {
+            for (auto b = start_x; b < stop_x; b++) {
-                for (int c = start_y; c < stop_y; c++) {
+                for (auto c = start_y; c < stop_y; c++) {
                    for (int kRow = 0; kRow < kH; ++kRow) {
                        for (int kCol = 0; kCol < kW; ++kCol) {
                            for (int colH = 0; colH < oH; ++colH) {
@ -98,8 +98,8 @@ static void im2col_(nd4j::LaunchContext & context, const NDArray& input,  NDArra
            T *col, *im;
            int imRow, imCol;
-            for (int b = start_x; b < stop_x; b += inc_x) {
+            for (auto b = start_x; b < stop_x; b += inc_x) {
-                for (int colH = start_y; colH < stop_y; colH += inc_y) {
+                for (auto colH = start_y; colH < stop_y; colH += inc_y) {
                    for (int colW = 0; colW < oW; ++colW) {
                        for (int c = 0; c < iC; ++c) {
                            for (int kRow = 0; kRow < kH; ++kRow) {
--- a/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/image_resize.cpp
@ -219,16 +219,16 @@ namespace helpers {
        auto func = PRAGMA_THREADS_FOR {
            for (auto batch = start; batch < stop; ++batch) {
                auto pInput = pInputBuf + batch * inBatchNumValues;
-                for (auto y = 0; y < outHeight; ++y) {
+                for (Nd4jLong y = 0; y < outHeight; ++y) {
                    auto pOutput = pOutputBuf + (batch * outHeight + y) * outRowSize;
                    const T* ysInputLowerPtr = pInput + ys[y]._bottomIndex * inRowSize;
                    const T* ysInputUpperPtr = pInput + ys[y]._topIndex * inRowSize;
                    double yVal = ys[y]._interpolarValue;
-                    for (auto x = 0; x < outWidth; ++x) {
+                    for (Nd4jLong x = 0; x < outWidth; ++x) {
                        auto xsBottom = xsPtr[x]._bottomIndex;
                        auto xsTop = xsPtr[x]._topIndex;
                        auto xVal = xsPtr[x]._interpolarValue;
-                        for (auto c = 0; c < channels; ++c) {
+                        for (Nd4jLong c = 0; c < channels; ++c) {
                            double topLeft(ysInputLowerPtr[xsBottom + c]);
                            double topRight(ysInputLowerPtr[xsTop + c]);
                            double bottomLeft(ysInputUpperPtr[xsBottom + c]);
@ -310,14 +310,14 @@ namespace helpers {
                    if (halfPixelCenter) {
                        inY = nd4j::math::nd4j_max(0LL, inY);
                    }
-                    for (auto x = 0; x < outWidth; ++x) {
+                    for (Nd4jLong x = 0; x < outWidth; ++x) {
                        auto posX = alignCorners ? static_cast<Nd4jLong>(nd4j::math::p_round<float>(scaler(x, st.widthScale))) : static_cast<Nd4jLong>(nd4j::math::p_floor<float>(scaler(x, st.widthScale)));
                        Nd4jLong inX = nd4j::math::nd4j_min(posX,inWidth - 1);
                        if (halfPixelCenter) {
                            inX = nd4j::math::nd4j_max(0LL, inX);
                        }
                        // copy pixel over all channels
-                        for (auto e = 0; e < channels; e++)
+                        for (Nd4jLong e = 0; e < channels; e++)
                            output->t<T>(b, y, x, e) = images->t<T>(b, inY, inX, e);
                    }
                }
@ -613,7 +613,7 @@ namespace helpers {
            for (auto b = start; b < stop; ++b) {
                auto pInput = inputPtr + b * inBatchWidth;
-                for (auto y = 0; y < outHeight; ++y) {
+                for (Nd4jLong y = 0; y < outHeight; ++y) {
                    auto pOutput = &pOutputY[(b * outHeight + y) * outWidth * numChannels];
                    WeightsAndIndices yWai;
@ -635,7 +635,7 @@ namespace helpers {
                        F cached_value_0[4] = {0};
                        F cached_value_1[4] = {0};
                        F cached_value_2[4] = {0};
-                        for (auto x = 0; x < resizerState.outWidth; ++x) {
+                        for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) {
                            const WeightsAndIndices &xWai = xWais[x];
                            // Shift values in cached_value_* to fill first '_advance' values.
                            switch (xWai._advance) {
@ -712,7 +712,7 @@ namespace helpers {
                                            xWai._weight2, xWai._weight3);
                        }
                    } else {
-                        for (auto x = 0; x < resizerState.outWidth; ++x) {
+                        for (Nd4jLong x = 0; x < resizerState.outWidth; ++x) {
                            const WeightsAndIndices &xWai = xWais[x];
                            // Shift values in cachedValue to fill first '_advance' values.
                            switch (xWai._advance) {
@ -828,7 +828,7 @@ namespace helpers {
        float sum_0 = 0;
        float sum_1 = 0;
        float sum_2 = 0;
-        for (int i = 0; i < yPtrs.size(); ++i) {
+        for (size_t i = 0; i < yPtrs.size(); ++i) {
            const T* ptr = yPtrs[i].yPtr;
            float scaleX = xCache.startScale;
            Nd4jLong offset = 3 * boundIfNeeded(xCache.start, st.inWidth);
@ -879,7 +879,7 @@ namespace helpers {
            const auto numChannels = st.channels;
            for (Nd4jLong c = 0; c < numChannels; ++c) {
                float sum = 0;
-                for (int i = 0; i < yPtrs.size(); ++i) {
+                for (size_t i = 0; i < yPtrs.size(); ++i) {
                    T const* ptr = yPtrs[i].yPtr;
                    float scaleX = xCache.startScale;
                    float sumY = static_cast<float>(ptr[numChannels * boundIfNeeded(xCache.start, st.inWidth) + c]) * scaleX;
--- a/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/lrn.cpp
@ -62,7 +62,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
    if(inTadEws == 1 && outTadEws == 1) {
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i++) {
+            for (auto i = start; i < stop; i++) {
                const T *x = inBuff + inTadOffsets[i];
                T *y = outBuff + outTadOffsets[i];
@ -70,7 +70,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
                // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1]
                // we store each squared sum in corresponding element of y array
-                for (uint j = 0; j < tadLen; ++j) {
+                for (Nd4jLong j = 0; j < tadLen; ++j) {
                    const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
                    const uint last = depth + j + 1;
                    const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -100,7 +100,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
    }
    else {
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = 0; i < numOfTads; ++i) {
+            for (Nd4jLong i = 0; i < numOfTads; ++i) {
                const T *x = inBuff + inTadOffsets[i];
                T *y = outBuff + outTadOffsets[i];
@ -108,7 +108,7 @@ static int lrnFunctor_(nd4j::graph::Context& block, NDArray* input, NDArray* out
                // calculate squared sum of elements per each j-th element range [j - depth, j + depth + 1]
                // we store each squared sum in corresponding element of y array
-                for (uint j = 0; j < tadLen; ++j) {
+                for (Nd4jLong j = 0; j < tadLen; ++j) {
                    const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
                    const uint last = depth + j + 1;
                    const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -179,13 +179,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
    if(inTadEws == 1 && gradITadEws == 1) {
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i++) {
+            for (auto i = start; i < stop; i++) {
                const X *x = inBuff + inTadOffsets[i];
                      Y *y = gradIBuff + gradITadOffsets[i];
                // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1]
                // we store each squared sum in corresponding element of y array
-                for (uint j = 0; j < tadLen; ++j) {
+                for (Nd4jLong j = 0; j < tadLen; ++j) {
                    const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
                    const uint last = depth + j + 1;
                    const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -208,7 +208,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
                Y prev = 0;
                // second loop calculates derivatives using information gained in first loop above
-                for (uint j = 0; j < tadLen; ++j) {
+                for (Nd4jLong j = 0; j < tadLen; ++j) {
                    const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
                    const uint last = depth + j + 1;
                    const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -247,13 +247,13 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
    else {
        auto func = PRAGMA_THREADS_FOR {
-            for (uint i = start; i < stop; i++) {
+            for (auto i = start; i < stop; i++) {
                const X *x = inBuff + inTadOffsets[i];
                      Y *y = gradIBuff + gradITadOffsets[i];
                // this loop calculates squared sum of elements per each j-th element range [j - depth, j + depth + 1]
                // we store each squared sum in corresponding element of y array
-                for (uint j = 0; j < tadLen; ++j) {
+                for (Nd4jLong j = 0; j < tadLen; ++j) {
                    const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
                    const uint last = depth + j + 1;
                    const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
@ -280,7 +280,7 @@ static void lrnBP_(const NDArray& input, const NDArray& gradO, NDArray& gradI, c
                Y prev = 0;
                // second loop calculates derivatives using information gained in first loop above
-                for (uint j = 0; j < tadLen; ++j) {
+                for (Nd4jLong j = 0; j < tadLen; ++j) {
                    const uint begin = nd4j::math::nd4j_max<int>(0, j - depth);
                    const uint last = depth + j + 1;
                    const uint end = nd4j::math::nd4j_min<int>(last, tadLen);
--- a/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/lstm.cpp
@ -124,7 +124,7 @@ static void fusedTanh(NDArray *z, NDArray *i, NDArray *c, const NDArray *cLast,
    auto h_ = h->bufferAsT<T>();
    auto func = PRAGMA_THREADS_FOR {
-        for (uint e = start; e < stop; e++) {
+        for (auto e = start; e < stop; e++) {
            c_[e] = z_[e] * i_[e] + (f_[e] * cLast_[e]);
            h_[e] = nd4j::math::nd4j_tanh<T, T>(c_[e]);
        }
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrix_band.cpp
@ -32,7 +32,7 @@ namespace helpers {
        Nd4jLong preLastDim = input->rankOf() - 2;
        ResultSet listOut = output->allTensorsAlongDimension({(int)preLastDim, (int)lastDim});
        ResultSet listDiag = input->allTensorsAlongDimension({(int)preLastDim, (int)lastDim});
-        for (Nd4jLong e = 0; e < listOut.size(); ++e) {
+        for (Nd4jLong e = 0; e < static_cast<Nd4jLong>(listOut.size()); ++e) {
            NDArray* inputMatrix = listDiag.at(e);
            NDArray* outputMatrix = listOut.at(e);
            if (outputMatrix != inputMatrix) // if not inplace
--- a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp
@ -68,7 +68,7 @@ namespace nd4j {
                    if (shape::elementWiseStride(xShapeInfo) == 1 && shape::elementWiseStride(zShapeInfo) == 1 &&
                        shape::order(xShapeInfo) == 'c' && shape::order(zShapeInfo) == 'c') {
-                        for (int e = 0; e < length; e++) {
+                        for (Nd4jLong e = 0; e < length; e++) {
                            sum = op == scalar::Add ? simdOps::Add<T, T, T>::op(sum, x[e]) : simdOps::Multiply<T, T, T>::op(sum, x[e]);
                            if (!exclusive)
@ -81,7 +81,7 @@ namespace nd4j {
                    }
                    else {
-                        for (int e = 0; e < length; e++) {
+                        for (Nd4jLong e = 0; e < length; e++) {
                            auto xOffset = shape::getIndexOffset(e, xShapeInfo);
                            auto zOffset = shape::getIndexOffset(e, zShapeInfo);
--- a/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/qr.cpp
@ -43,8 +43,8 @@ namespace helpers {
        T const* vBuf = v.getDataBuffer()->primaryAsT<T>();
        T* resBuf = res.dataBuffer()->primaryAsT<T>();
        auto interloop = PRAGMA_THREADS_FOR_2D {
-            for (int i = start_x; i < n; i += inc_x)
+            for (auto i = start_x; i < n; i += inc_x)
-                for (int j = start_y; j < n; j += inc_y)
+                for (auto j = start_y; j < n; j += inc_y)
                    resBuf[i * n + j] = -2 * vBuf[i] * vBuf[j] + (i == j ? T(1) : T(0));
        };
@ -63,7 +63,7 @@ namespace helpers {
        NDArray z = *matrix;
        NDArray e('c', {M}, DataTypeUtils::fromT<T>()); // two internal buffers and scalar for squared norm
-        for (auto k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further then row number
+        for (Nd4jLong k = 0; k < N && k < M - 1; k++) { // loop for columns, but not further then row number
            e.nullify();
            z = matrixMinor<T>(z, k); // minor computing for current column with given matrix z (initally is a input matrix)
 //            z.printIndexedBuffer("Minor!!!");
@ -87,7 +87,7 @@ namespace helpers {
        }
        resQ.assign(q[0]); //
 //        MmulHelper::matmul(&q[0], matrix, &resR, false, false);
-        for (int i = 1; i < N && i < M - 1; i++) {
+        for (Nd4jLong i = 1; i < N && i < M - 1; i++) {
            auto tempResQ = resQ;
            MmulHelper::matmul(&q[i], &resQ, &tempResQ, false, false); // use mmulMxM?
            resQ = std::move(tempResQ);
--- a/libnd4j/include/ops/declarable/helpers/cpu/random.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/random.cpp
@ -57,10 +57,10 @@ namespace helpers {
        T* outputBuf = output->dataBuffer()->primaryAsT<T>();
        PRAGMA_OMP_PARALLEL_FOR
-        for (auto k = 0; k < shift; k++) {
+        for (Nd4jLong k = 0; k < shift; k++) {
            auto pos = k * step;
            auto u = rng.relativeT<T>(k, 0., 1.);
-            for (auto e = 0; e < step; e++)
+            for (Nd4jLong e = 0; e < step; e++)
                    if (directOutput) {
                        outputBuf[pos + e] = math::nd4j_igamma<T, T, T>(copyAlpha->t<T>(e),
                                                                        beta != nullptr ? copyBeta->t<T>(e) * u : u);
@ -104,10 +104,10 @@ namespace helpers {
        bool directLa = lambda->ews() == 1 && lambda->ordering() == 'c';
        bool directOut = output->ews() == 1 && output->ordering() == 'c';
        PRAGMA_OMP_PARALLEL_FOR
-        for (auto k = 0; k < shift; k++) {
+        for (Nd4jLong k = 0; k < shift; k++) {
            auto pos = k * step;
            auto u = rng.relativeT<T>(k, 0., 1.);
-            for (auto e = 0; e < step; e++) {
+            for (Nd4jLong e = 0; e < step; e++) {
                auto p = math::nd4j_exp<T, T>(-lambda->t<T>(e));
                auto s = p;
                auto x = T(0.f);
@ -143,7 +143,7 @@ namespace helpers {
            RandomLauncher::fillUniform(context, rng, output, minVal, maxVal);
        else {
            PRAGMA_OMP_PARALLEL_FOR
-            for (auto i = 0; i < output->lengthOf(); i++) {
+            for (Nd4jLong i = 0; i < output->lengthOf(); i++) {
                output->t<T>(i) = rng.relativeT<T>(i, minVal, maxVal);
            }
        }
@ -184,7 +184,7 @@ namespace helpers {
                        auto nSamplesPerBatch = nBatchIndex * numOfClassX * numOfSamples;
                        auto nClassesPerSample = nSampleIndexInBatch * numOfClassX;
-                        for (auto nClass = 0; nClass < numOfClassX; nClass += 1) {
+                        for (Nd4jLong nClass = 0; nClass < numOfClassX; nClass += 1) {
                            auto nIndex = nSamplesPerBatch + nClassesPerSample + nClass;
                            auto unifornLog = nd4j::math::nd4j_log<Tx, Tx>(-nd4j::math::nd4j_log<Tx, Tx>(rng.relativeT<Tx>(nIndex, minVal, maxVal)));
                            Tx tValue = (xTad[nClass * xDimAstride] - unifornLog);
--- a/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/random_crop.cpp
@ -50,7 +50,7 @@ namespace helpers {
            width = lastDim;
        }
-        for (int i = 0; i < input->lengthOf(); i += lastDim) {
+        for (Nd4jLong i = 0; i < input->lengthOf(); i += lastDim) {
            for (Nd4jLong k = startPos; k < width && pos < output->lengthOf(); k++) {
                output->p(pos++, input->e<T>(i + k));
            }
--- a/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/roll.cpp
@ -110,7 +110,7 @@ namespace helpers {
            }
            else {
                std::vector<int> dims(source->rankOf() - axe - 1);
-                for (int i = 0; i < dims.size(); ++i)
+                for (size_t i = 0; i < dims.size(); ++i)
                    dims[i] = axe + 1 + i;
                ResultSet listOfTensors = source->allTensorsAlongDimension({dims});
--- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
@ -55,9 +55,9 @@ static void batchToSpace_(const NDArray& input, NDArray& output, const uint crop
    // loop through output array
    auto func = PRAGMA_THREADS_FOR_3D {
-        for (uint b = start_x; b < stop_x; b += inc_x) {
+        for (auto b = start_x; b < stop_x; b += inc_x) {
-            for (uint h = start_y; h < stop_y; h += inc_y) {
+            for (auto h = start_y; h < stop_y; h += inc_y) {
-                for (uint w = start_z; w < stop_z; w += inc_z) {
+                for (auto w = start_z; w < stop_z; w += inc_z) {
                    for (uint c = 0; c < iC; ++c) {
                        const Nd4jLong xOffset = b * xShapeInfo[5] + h * xShapeInfo[6] + w * xShapeInfo[7] + c * xShapeInfo[8];
                        const Nd4jLong zOffset = b * zShapeInfo[5] + (h - cropBottom) * zShapeInfo[6] + (w - cropLeft) * zShapeInfo[7] + c * zShapeInfo[8];
@ -146,11 +146,11 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND
    std::vector<Nd4jLong> temp(numOfSpatialDims + rank);
-    int i;
+    uint i;
    for(i = 0; i < numOfSpatialDims; ++i)
        temp[i] = blockShape.e<Nd4jLong>(i);
    temp[i++] = output.sizeAt(0);
-    for(int j = 1; j < rank; ++i, ++j)
+    for(uint j = 1; j < rank; ++i, ++j)
        temp[i] = input.sizeAt(j);
    NDArray inputRearranged0 = input.reshape(input.ordering(), temp);
@ -163,7 +163,7 @@ void batchToSpaceND(nd4j::LaunchContext* context, const NDArray& input, const ND
        temp[2*i - 1] = numOfSpatialDims + i;
        temp[2*i]     = i - 1;
    }
-    for(i = 2 * numOfSpatialDims + 1; i < temp.size(); ++i)
+    for(i = 2 * numOfSpatialDims + 1; i < static_cast<uint>(temp.size()); ++i)
        temp[i] = i;
    inputRearranged0.permutei(temp);
@ -216,8 +216,8 @@ static void spaceToBatch_(const NDArray& input, NDArray& output, const uint padB
    // loop through output array
    auto func = PRAGMA_THREADS_FOR_2D {
-        for (uint b = start_x; b < stop_x; b += inc_x) {
+        for (auto b = start_x; b < stop_x; b += inc_x) {
-            for (uint h = start_y; h < stop_y; h += inc_y) {
+            for (auto h = start_y; h < stop_y; h += inc_y) {
                for (uint w = 0; w < oW; ++w) {
                    for (uint c = 0; c < iC; ++c) {
--- a/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/segment.cpp
@ -87,7 +87,7 @@ namespace helpers {
        if (input->isVector()) {
            T val = input->e<T>(0);
-            for (int e = 1; e < indices->lengthOf(); e++) {
+            for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
                if (idx == indices->e<Nd4jLong>(e)) {
                   // min
                   val = nd4j::math::nd4j_min<T>(val, input->t<T>(e));
@ -115,7 +115,7 @@ namespace helpers {
            for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
                if (indices->e<Nd4jLong>(i) == idx) {
-                    for (int e = 0; e < minT->lengthOf(); e++) {
+                    for (Nd4jLong e = 0; e < minT->lengthOf(); e++) {
                       minT->p(e, nd4j::math::nd4j_min(minT->e<T>(e), listOfTensors.at(i)->e<T>(e)));
                    }
                }
@ -138,7 +138,7 @@ namespace helpers {
            T val = T(0.f);
            int count = 0;
-            for (int e = 0; e < indices->lengthOf(); e++) {
+            for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
                if (idx == indices->e<int>(e)) {
                   // mean
                   val += input->e<T>(e);
@ -166,7 +166,7 @@ namespace helpers {
            auto meanV = meanT->dup();
            meanV.assign(listOfTensors.at(0));
-            for (int i = 1; i < indices->lengthOf(); i++) {
+            for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
                if (indices->e<int>(i) == idx) {
                    auto func = PRAGMA_THREADS_FOR {
                        for (auto e = start; e < stop; e++) {
@ -198,7 +198,7 @@ namespace helpers {
        if (input->isVector()) {
            T val = T(0.f);
            int count = 0;
-            for (int e = 0; e < indices->lengthOf(); e++) {
+            for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
                if (idx == indices->e<int>(e)) {
                   // sum
                   val += input->t<T>(e);
@ -220,7 +220,7 @@ namespace helpers {
            std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
            auto sumT = listOfOutTensors.at(idx);
-            for (int i = 0; i < indices->lengthOf(); i++) {
+            for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                if (indices->e<int>(i) == idx) {
                    auto func = PRAGMA_THREADS_FOR {
                        for (auto e = start; e < stop; e++) {
@ -248,7 +248,7 @@ namespace helpers {
            T val = input->e<T>(0);
            int count = 0;
-            for (int e = 1; e < indices->lengthOf(); e++) {
+            for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
                if (idx == indices->e<int>(e)) {
                   // sum
                   val *= input->e<T>(e);
@ -269,7 +269,7 @@ namespace helpers {
            int numOfClasses = output->sizeAt(0); // number of classes
            auto sumT = listOfOutTensors.at(idx);
            sumT->assign(listOfTensors.at(0));
-            for (int i = 1; i < indices->lengthOf(); i++) {
+            for (Nd4jLong i = 1; i < indices->lengthOf(); i++) {
                if (indices->e<int>(i)  == idx) {
                    auto func = PRAGMA_THREADS_FOR {
                        for (auto e = start; e < stop; e++) {
@ -313,7 +313,7 @@ namespace helpers {
    bool segmentIndicesValidate(nd4j::LaunchContext * context, NDArray* indices, NDArray& expected, NDArray& output) {
        auto val = indices->e(0);
-        for (int e = 1; e < indices->lengthOf(); e++) {
+        for (Nd4jLong e = 1; e < indices->lengthOf(); e++) {
            output = indices->e(e);
            if (val.e<Nd4jLong>(0) > output.e<Nd4jLong>(0))
                return false;
@ -362,7 +362,7 @@ namespace helpers {
            for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
                T val = input->e<T>(fi->second.at(0));
-                for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
+                for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) {
                    val = nd4j::math::nd4j_max(val, input->e<T>(fi->second.at(idx)));
                }
                output->p(fi->first, val);
@ -380,7 +380,7 @@ namespace helpers {
            for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
                auto outputT = listOfOutTensors.at(fi->first);
                outputT->assign(listOfTensors.at(fi->second.at(0)));
-                for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
+                for (Nd4jLong idx = 1; idx < static_cast<Nd4jLong>(fi->second.size()); ++idx) {
                    auto maxT = listOfTensors.at(fi->second.at(idx));
                    for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) {
                        T val = nd4j::math::nd4j_max(maxT->e<T>(e), outputT->e<T>(e));
@ -432,7 +432,7 @@ namespace helpers {
            for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
                auto outputT = listOfOutTensors.at(fi->first);
                outputT->assign(listOfTensors.at(fi->second.at(0)));
-                for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
+                for (size_t idx = 1; idx < fi->second.size(); ++idx) {
                    auto minT = listOfTensors.at(fi->second.at(idx));
                    for (Nd4jLong e = 0; e < outputT->lengthOf(); ++e) {
@ -560,7 +560,7 @@ namespace helpers {
            for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
                auto outputT = listOfOutTensors.at(fi->first);
                outputT->assign(listOfTensors.at(fi->second.at(0)));
-                for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
+                for (size_t idx = 1; idx < fi->second.size(); ++idx) {
                    auto current = listOfTensors.at(fi->second.at(idx));
                    *outputT *= *current;
@ -584,7 +584,7 @@ namespace helpers {
        if (input->isVector()) { // 1D case
            for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
                double sumValue = input->e<double>(fi->second.at(0));
-                for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
+                for (size_t idx = 1; idx < fi->second.size(); ++idx) {
                    sumValue += input->e<double>(fi->second.at(idx));
                }
                output->p(fi->first, sumValue / nd4j::math::nd4j_sqrt<Nd4jLong, double>(fi->second.size()));
@ -599,7 +599,7 @@ namespace helpers {
            for (auto fi = idxs.begin(); fi != idxs.end(); ++fi) {
                auto outputT = listOfOutTensors.at(fi->first);
                outputT->assign(listOfTensors.at(fi->second.at(0)));
-                for (Nd4jLong idx = 1; idx < fi->second.size(); ++idx) {
+                for (size_t idx = 1; idx < fi->second.size(); ++idx) {
                    auto current = listOfTensors.at(fi->second.at(idx));
                    *outputT += *current;
                }
@ -651,7 +651,7 @@ namespace helpers {
                    auto currentOut = listOfOutTensors.at(i);
                    auto currentGradOut = listOfGradOuts.at(classNum);
-                    for (uint64_t e = 0; e < current->lengthOf(); e++) {
+                    for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
                        if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e<T>(e) - current->e<T>(e)) <= T(1.e-6))
                            currentOut->p(e, currentGradOut->e<T>(e));
                    }
@ -703,7 +703,7 @@ namespace helpers {
                    auto currentOut = listOfOutTensors.at(i);
                    auto currentGradOut = listOfGradOuts.at(classNum);
-                    for (int e = 0; e < current->lengthOf(); e++) {
+                    for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
                        if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->e<double>(e) - current->e<double>(e)) <
                            1.e-5)
                            currentOut->p(e, currentGradOut->e<double>(e));
@ -746,13 +746,13 @@ namespace helpers {
            int pos = 0;
            //auto func = [&](uint64_t thread_id, uint64_t start, uint64_t stop, uint64_t increment) -> void {
-                for (auto i = 0; i < indices->lengthOf(); i++) {
+                for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto current = listOfTensors.at(i);
                    auto currentOut = listOfOutTensors.at(i);
                    auto currentGradOut = listOfGradOuts.at(classNum);
-                    for (int e = 0; e < current->lengthOf(); e++) {
+                    for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
                        currentOut->p(e, currentGradOut->e<double>(e) / classCount.at(classNum));
                    }
                }
@ -781,7 +781,7 @@ namespace helpers {
            ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
            //auto func = PRAGMA_THREADS_FOR {
-                for (auto i = 0; i < indices->lengthOf(); i++) {
+                for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto current = listOfTensors.at(i);
                    auto currentOut = listOfOutTensors.at(i);
@ -817,7 +817,7 @@ namespace helpers {
            //std::vector<std::pair<NDArray*, int>> outputs(numOfClasses);
            //auto func = PRAGMA_THREADS_FOR {
-                for (auto i = 0; i < indices->lengthOf(); i++) {
+                for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto current = listOfTensors.at(i);
                    auto currentOut = listOfOutTensors.at(i);
@ -860,7 +860,7 @@ namespace helpers {
            ResultSet listOfTensors = input->allTensorsAlongDimension(restDims);
            ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
-            for (int i = 0; i < indices->lengthOf(); i++) {
+            for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                Nd4jLong classNum = indices->e<Nd4jLong>(i);
                NDArray* current = listOfTensors.at(i);
                NDArray* currentOut = listOfOutTensors.at(i);
@ -905,13 +905,13 @@ namespace helpers {
            ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
            //auto func = PRAGMA_THREADS_FOR {
-                for (auto i = 0; i < indices->lengthOf(); i++) {
+                for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto current = listOfTensors.at(i);
                    auto currentOut = listOfOutTensors.at(i);
                    auto currentGradOut = listOfGradOuts.at(classNum);
-                    for (int e = 0; e < current->lengthOf(); e++) {
+                    for (Nd4jLong e = 0; e < current->lengthOf(); e++) {
                        if (nd4j::math::nd4j_abs(listOfBPTensors.at(classNum)->t<T>(e) - current->t<T>(e)) < 1.e-6)
                            currentOut->t<T>(e) = currentGradOut->t<T>(e);
                    }
@ -955,7 +955,7 @@ namespace helpers {
            ResultSet listOfTensors = input->allTensorsAlongDimension(restDims);
            ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
-            for (int i = 0; i < indices->lengthOf(); i++) {
+            for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                Nd4jLong classNum = indices->e<Nd4jLong>(i);
                NDArray* current = listOfTensors.at(i);
                NDArray* currentOut = listOfOutTensors.at(i);
@ -984,7 +984,7 @@ namespace helpers {
            ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
            //auto func = PRAGMA_THREADS_FOR {
-                for (auto i = 0; i < indices->lengthOf(); i++) {
+                for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto currentOut = listOfOutTensors.at(i);
                    auto currentGradOut = listOfGradOuts.at(classNum);
@ -1021,7 +1021,7 @@ namespace helpers {
            ResultSet listOfOutTensors = output->allTensorsAlongDimension(restDims);
            //auto func = PRAGMA_THREADS_FOR {
-                for (auto i = 0; i < indices->lengthOf(); i++) {
+                for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto current = listOfTensors.at(i);
                    auto currentOut = listOfOutTensors.at(i);
@ -1053,7 +1053,7 @@ namespace helpers {
        // if input is a vector: (as if in doc sample)
        if (input->isVector()) {
            //auto func = PRAGMA_THREADS_FOR {
-                for (auto e = 0; e < indices->lengthOf(); e++) {
+                for (Nd4jLong e = 0; e < indices->lengthOf(); e++) {
                    auto classNum = indices->e<Nd4jLong>(e);
                    output->p(e, gradOut->e<double>(classNum) / nd4j::math::nd4j_sqrt<double, double>(classCount[classNum]));
                }
@ -1069,7 +1069,7 @@ namespace helpers {
            ResultSet listOfOutTensors  =output->allTensorsAlongDimension(restDims);
            //auto func = PRAGMA_THREADS_FOR {
-                for (auto i = 0; i < indices->lengthOf(); i++) {
+                for (Nd4jLong i = 0; i < indices->lengthOf(); i++) {
                    auto classNum = indices->e<Nd4jLong>(i);
                    auto current = listOfTensors.at(i);
                    auto currentOut = listOfOutTensors.at(i);
--- a/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/sg_cb.cpp
@ -378,7 +378,7 @@ namespace nd4j {
                                int irow = 0;
                                auto cShift = t * idxShift;
-                                for (int e = 0; e < hsRounds; e++) {
+                                for (Nd4jLong e = 0; e < hsRounds; e++) {
                                    irow = bIndices[e + cShift];
                                    if (irow < 0 || irow >= vocabSize)
                                        continue;
@ -457,7 +457,7 @@ namespace nd4j {
                    T sneu1[600];
                    T sneu1e[600];
-                    for (int e = start; e < stop; e++) {
+                    for (auto e = start; e < stop; e++) {
                        T *neu1 = vectorLength <= 600 ? sneu1 : new T[vectorLength];
                        T *neu1e = vectorLength <= 600 ? sneu1e : new T[vectorLength];
@ -500,7 +500,7 @@ namespace nd4j {
                        // hierarchic softmax step
                        if (!indices.isEmpty()) {
-                            for (int i = 0; i < numIndices; i++) {
+                            for (Nd4jLong i = 0; i < numIndices; i++) {
                                const int cIndex = bIndices[(e * numIndices) + i];
                                const int cCode = bCodes[(e * numIndices) + i];
--- a/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/solve.cpp
@ -41,8 +41,8 @@ namespace helpers {
        auto batchLoop = PRAGMA_THREADS_FOR {
            for (auto batch = start; batch < stop; batch++) {
-                for (auto r = 0; r < rows; r++) {
+                for (Nd4jLong r = 0; r < rows; r++) {
-                    for (auto c = 0; c < r; c++) {
+                    for (Nd4jLong c = 0; c < r; c++) {
                        math::nd4j_swap(outputPart[batch]->t<T>(r, c) , outputPart[batch]->t<T>(c, r));
                    }
                }
@ -66,7 +66,7 @@ namespace helpers {
        auto permutationsPart = permutations.allTensorsAlongDimension({-1});
        for (auto batch = 0; batch < permutationsPart.size(); ++batch) {
-            for (auto row = 0; row < PPart[batch]->rows(); ++row) {
+            for (Nd4jLong row = 0; row < PPart[batch]->rows(); ++row) {
                PPart[batch]->t<T>(row, permutationsPart[batch]->t<int>(row)) = T(1.f);
            }
        }
@ -77,7 +77,7 @@ namespace helpers {
        MmulHelper::matmul(&P, rightInput, &rightPermuted, 0, 0);
        ResultSet leftLowerPart = leftLower.allTensorsAlongDimension({-2, -1});
        for (auto i = 0; i < leftLowerPart.size(); i++) {
-            for (auto r = 0; r < leftLowerPart[i]->rows(); r++)
+            for (Nd4jLong r = 0; r < leftLowerPart[i]->rows(); r++)
                leftLowerPart[i]->t<T>(r,r) = (T)1.f;
        }
        // stage 2: triangularSolveFunctor for Lower with given b
--- a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
@ -29,7 +29,7 @@ namespace helpers {
            //////////////////////////////////////////////////////////////////////////
            template <typename T>
            static void split_(const NDArray& input, const std::vector<NDArray*>& outArrs, const int axis) {
-                int numSplits = outArrs.size();
+                uint numSplits = outArrs.size();
                const auto sizeofT = input.sizeOfT();
@ -73,9 +73,9 @@ namespace helpers {
                if (luckCase2) {
-                    const uint xDim = input.sizeAt(axis);
+                    const auto xDim = input.sizeAt(axis);
-                    for (uint i = 0; i < input.lengthOf() / xDim; ++i) {
+                    for (Nd4jLong i = 0; i < input.lengthOf() / xDim; ++i) {
                        T* x = xBuff + xDim * i;
--- a/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/top_k.cpp
@ -39,7 +39,7 @@ namespace helpers {
 //        }
 // ----------------------------------------------------------------------------------------------- //
        std::vector<int> dimsToExclude(input->rankOf() - 1);
-        for (int d = 0; d < dimsToExclude.size(); ++d)
+        for (size_t d = 0; d < dimsToExclude.size(); ++d)
            dimsToExclude[d] = d;
        const Nd4jLong numOfSubArrs = ShapeUtils::getNumOfSubArrs(input->getShapeInfo(), dimsToExclude);
@ -72,7 +72,7 @@ namespace helpers {
                    NDArray topValues = NDArrayFactory::create<T>('c', {k});
                    NDArray sortedVals = NDArrayFactory::create<T>('c', {k});
                    NDArray topIndices = NDArrayFactory::create<Nd4jLong>('c', {k});
-                    for (Nd4jLong pos = 0; pos < k; ++pos) {
+                    for (uint pos = 0; pos < k; ++pos) {
                        topIndices.t<Nd4jLong>(pos) = pos;
                        topValues.t<T>(pos) = trial.t<T>(pos);
                    }
@ -80,7 +80,7 @@ namespace helpers {
                    sortedVals.assign(topValues);// = NDArrayFactory::create<T>('c', {k});
                    //std::sort(sortedVals.begin(), sortedVals.end()); // sorted in ascending order
                    SpecialMethods<T>::sortGeneric(sortedVals.buffer(), sortedVals.shapeInfo(), false);
-                    for (int i = k; i < width; ++i) {
+                    for (Nd4jLong i = static_cast<Nd4jLong>(k); i < width; ++i) {
                        T val = trial.e<T>(i);
                        T minTopVal = sortedVals.t<T>(0);
                        if (minTopVal < val) { // value should be inserted to top k
@ -104,15 +104,15 @@ namespace helpers {
                    if (needSort) {
                        SpecialMethods<T>::sortGeneric(topValues.buffer(), topValues.shapeInfo(), true);
-                        for (int j = 0; j < width; j++)
+                        for (Nd4jLong j = 0; j < width; j++)
-                            for (int pos = 0; pos < k; ++pos)
+                            for (uint pos = 0; pos < k; ++pos)
                                if (topValues.t<T>(pos) == trial.t<T>(j))
                                    topIndices.t<Nd4jLong>(pos) = j;
                    }
                    else { // else sort by indices
                        std::map<Nd4jLong, T> sortValsMap;
                        //std::vector<std::pair<int, T>> data(topValues.lengthOf());
-                        for (size_t e = 0; e < topValues.lengthOf(); ++e) {
+                        for (Nd4jLong e = 0; e < topValues.lengthOf(); ++e) {
                            sortValsMap[topIndices.t<Nd4jLong>(e)] = topValues.t<T>(e);
                        }
@ -152,7 +152,7 @@ namespace helpers {
                auto func = PRAGMA_THREADS_FOR {
                    for (auto e = start; e < stop; e++) {
                        bool found = false;
-                        for (int j = 0; j < k; j++) {
+                        for (uint j = 0; j < k; j++) {
                            if (target->e<Nd4jLong>(e) == indices->e<Nd4jLong>(e * k + j)) {
                                found = true;
                                break;
--- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
@ -597,7 +597,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
                zCoordStart[yRank - 1] = coordToRestore;
            // construct coordinates for x
-            for (uint j = 0; j < yLastDim; ++j)
+            for (int j = 0; j < yLastDim; ++j)
                xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]];   // last stride
            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
@ -628,7 +628,7 @@ static void gather_(NDArray* input, const NDArray* indices, NDArray* output, con
    if (indices != nullptr) {
-        for(int i = 0; i < indices->lengthOf(); ++i)
+        for(Nd4jLong i = 0; i < indices->lengthOf(); ++i)
            if(indices->e<Nd4jLong>(i) >= input->sizeAt(axis))
                throw std::runtime_error("helpers::gather function: indices array contains wrong elements, each element must be smaller than corresponding dimension of input array !");
@ -733,7 +733,7 @@ void scatterUpdate(nd4j::LaunchContext * context, NDArray& input, NDArray& updat
    // increasing counter to skip numIndices
    e++;
    std::vector<int> indices;
-    for (; e < intArgs->size(); e++)
+    for (; e < static_cast<Nd4jLong>(intArgs->size()); e++)
        indices.push_back((*intArgs)[e]);
    auto func = PRAGMA_THREADS_FOR {
@ -813,7 +813,7 @@ static void mergeMaxIndex_(const std::vector<NDArray*>& inArrs, NDArray& output)
            T max = -DataTypeUtils::max<T>();
            Nd4jLong idx = 0;
-            for (int i = 0; i < numArgs; i++) {
+            for (Nd4jLong i = 0; i < numArgs; i++) {
                T v = inArrs[i]->e<T>(e);
                if (v > max) {
                    max = v;
@ -841,7 +841,7 @@ static void mergeMax_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto func = PRAGMA_THREADS_FOR {
        for (auto e = start; e < stop; e++) {
            T max = -DataTypeUtils::max<T>();
-            for (int i = 0; i < numArgs; i++) {
+            for (Nd4jLong i = 0; i < numArgs; i++) {
                T v = inArrs[i]->e<T>(e);
                if (v > max)
                    max = v;
@ -867,7 +867,7 @@ static void mergeAvg_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto func = PRAGMA_THREADS_FOR {
        for (auto e = start; e < stop; e++) {
            T sum = 0.;
-            for (int i = 0; i < numArgs; i++) {
+            for (Nd4jLong i = 0; i < numArgs; i++) {
                T v = inArrs[i]->e<T>(e);
                sum += v;
            }
@ -893,7 +893,7 @@ static void mergeAdd_(const std::vector<NDArray*>& inArrs, NDArray& output) {
    auto func = PRAGMA_THREADS_FOR {
        for (auto e = start; e < stop; e++) {
            T sum = (T) 0.f;
-            for (int i = 0; i < numArgs; i++)
+            for (Nd4jLong i = 0; i < numArgs; i++)
                sum += inArrs[i]->e<T>(e);
            output.p(e, sum);
@ -1242,7 +1242,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c
        memset(gradIBuff, 0, gradILen * sizeof(T));
    else {
        //PRAGMA_OMP_PARALLEL_FOR_SIMD
-        for (int i = 0; i < gradILen * gradIEWS; i += gradIEWS)
+        for (Nd4jLong i = 0; i < gradILen * gradIEWS; i += gradIEWS)
            gradIBuff[i] = static_cast<T>(0.f);
    }
--- a/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/triangular_solve.cpp
@ -43,10 +43,10 @@ namespace helpers {
        auto rows = leftInput->rows();
        auto cols = rightInput->columns();
        //output->t<T>(0,0) = rightInput->t<T>(0,0) / leftInput->t<T>(0,0);
-        for (auto r = 0; r < rows; r++) {
+        for (Nd4jLong r = 0; r < rows; r++) {
-            for (auto j = 0; j < cols; j++) {
+            for (Nd4jLong j = 0; j < cols; j++) {
                auto sum = rightInput->t<T>(r, j);
-                for (auto c = 0; c < r; c++) {
+                for (Nd4jLong c = 0; c < r; c++) {
                    sum -= leftInput->t<T>(r, c) * output->t<T>(c, j);
                }
                output->t<T>(r, j) = sum / leftInput->t<T>(r, r);
@ -72,10 +72,10 @@ namespace helpers {
    static void upperTriangularSolve(nd4j::LaunchContext * context, NDArray* leftInput, NDArray* rightInput, bool adjoint, NDArray* output) {
        auto rows = leftInput->rows();
        auto cols = rightInput->columns();
-        for (auto r = rows; r > 0; r--) {
+        for (Nd4jLong r = rows; r > 0; r--) {
-            for (auto j = 0; j < cols; j++) {
+            for (Nd4jLong j = 0; j < cols; j++) {
                auto sum = rightInput->t<T>(r - 1, j);
-                for (auto c = r; c < rows; c++) {
+                for (Nd4jLong c = r; c < rows; c++) {
                    sum -= leftInput->t<T>(r - 1, c) * output->t<T>(c, j);
                }
                output->t<T>(r - 1, j) = sum / leftInput->t<T>(r - 1, r - 1);
@ -114,14 +114,14 @@ namespace helpers {
        auto batchLoop = PRAGMA_THREADS_FOR {
            for (auto batch = start; batch < stop; batch++) {
                if (!lower) {
-                    for (auto r = 0; r < rows; r++) {
+                    for (Nd4jLong r = 0; r < rows; r++) {
-                        for (auto c = 0; c <= r; c++) {
+                        for (Nd4jLong c = 0; c <= r; c++) {
                            outputPart[batch]->t<T>(r, c) = inputPart[batch]->t<T>(c, r);
                        }
                    }
                } else {
-                    for (auto r = 0; r < rows; r++) {
+                    for (Nd4jLong r = 0; r < rows; r++) {
-                        for (auto c = r; c < cols; c++) {
+                        for (Nd4jLong c = r; c < cols; c++) {
                            outputPart[batch]->t<T>(r, c) = inputPart[batch]->t<T>(c, r);
                        }
                    }
--- a/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/weights.cpp
@ -26,7 +26,7 @@ namespace helpers {
    template <typename T>
    static void adjustWeights_(NDArray* input, NDArray* weights, NDArray* output, int minLength, int maxLength) {
-            for (int e = 0; e < input->lengthOf(); e++) {
+            for (Nd4jLong e = 0; e < input->lengthOf(); e++) {
                int val = input->e<int>(e);
                if (val < maxLength) {
                    if (weights != nullptr)