From 58550b7c98390d008bae637db10500e891064265 Mon Sep 17 00:00:00 2001
From: Yurii Shyrma
Date: Wed, 11 Mar 2020 15:21:59 +0200
Subject: [PATCH] [WIP] Shyrma coords (#305)

* - provide faster index2coords function for cpu

Signed-off-by: Yurii

* - new faster index2coords function is introduced into cpu code

Signed-off-by: Yurii

* - replace long long coordinates with int coordinates

Signed-off-by: Yurii

* - add missed overload of coords2index function

Signed-off-by: Yurii

* - restart jenkins

Signed-off-by: Yurii

* - rollback changes in convolutions.cu and addBias.cu

Signed-off-by: Yurii
---
 libnd4j/include/array/cpu/NDArray.cpp          |  27 +++--
 libnd4j/include/array/cuda/NDArray.cu          |  26 ++--
 .../helpers/benchmark/ParametersBatch.h        |   2 +-
 libnd4j/include/helpers/cpu/MmulHelper.cpp     |   6 +-
 .../include/helpers/cuda_off/MmulHelper.cu     |  16 +--
 libnd4j/include/helpers/impl/ShapeUtils.cpp    |   1 -
 libnd4j/include/helpers/shape.h                |  98 ++++++++++------
 libnd4j/include/loops/cpu/broadcasting.hpp     |   4 +-
 .../include/loops/cpu/broadcasting_bool.hpp    |   4 +-
 .../include/loops/cpu/broadcasting_int.hpp     |   4 +-
 libnd4j/include/loops/cuda/broadcasting.chpp   |   2 +-
 .../include/loops/cuda/broadcasting_bool.cu    |   2 +-
 .../include/loops/cuda/broadcasting_int.cu     |   2 +-
 .../loops/cuda/specials/concatKernel.cu        |  10 +-
 .../generic/compat/compat_string_split.cpp     |   5 +-
 .../ops/declarable/helpers/cpu/batchnorm.cpp   |  36 +++---
 .../declarable/helpers/cpu/imagesHelpers.cpp   |  34 +++---
 .../declarable/helpers/cpu/matrixSetDiag.cpp   |   7 +-
 .../ops/declarable/helpers/cpu/s_t_b.cpp       |  34 ++++--
 .../ops/declarable/helpers/cpu/scatter.cpp     |   4 +-
 .../ops/declarable/helpers/cpu/split.cpp       |   9 +-
 .../ops/declarable/helpers/cpu/transforms.cpp  | 111 ++++++++++--------
 .../declarable/helpers/cuda/activations.cu     |   4 +-
 .../ops/declarable/helpers/cuda/addBias.cu     |   4 +-
 .../ops/declarable/helpers/cuda/batchnorm.cu   |   2 +-
 .../ops/declarable/helpers/cuda/concat.cu      |   2 +-
 .../declarable/helpers/cuda/convolutions.cu    |  14 +--
 .../ops/declarable/helpers/cuda/cross.cu       |   8 +-
 .../ops/declarable/helpers/cuda/dilation2d.cu  |   8 +-
 .../ops/declarable/helpers/cuda/flatten.cu     |   2 +-
 .../ops/declarable/helpers/cuda/im2col.cu      |   8 +-
 .../declarable/helpers/cuda/imagesHelpers.cu   |  10 +-
 .../declarable/helpers/cuda/matrixSetDiag.cu   |  10 +-
 .../ops/declarable/helpers/cuda/s_t_b.cu       |  32 ++---
 .../ops/declarable/helpers/cuda/scatter.cu     |   8 +-
 .../ops/declarable/helpers/cuda/split.cu       |   2 +-
 .../ops/declarable/helpers/cuda/sru.cu         |  20 ++--
 .../ops/declarable/helpers/cuda/transforms.cu  |  26 ++--
 .../helpers/impl/sparse_to_dense.cpp           |   2 +-
 .../ops/declarable/helpers/impl/where.cpp      |   7 +-
 libnd4j/include/ops/impl/specials_single.hpp   |  16 ++-
 .../layers_tests/PlaygroundTests.cpp           |   1 +
 libnd4j/tests_cpu/layers_tests/TadTests.cpp    |   2 +-
 43 files changed, 359 insertions(+), 273 deletions(-)

diff --git a/libnd4j/include/array/cpu/NDArray.cpp b/libnd4j/include/array/cpu/NDArray.cpp
index 7d91d1373..1d97ba61c 100644
--- a/libnd4j/include/array/cpu/NDArray.cpp
+++ b/libnd4j/include/array/cpu/NDArray.cpp
@@ -95,22 +95,29 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
     const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target.getShapeInfo());

     auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
+        int coords[MAX_RANK], temp;
+
         for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, target.getShapeInfo(), coords);
+
+            shape::index2coordsCPU(start, i, target.getShapeInfo(), coords);
+
            const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
            // if( (row + upper < col) || (row + lower > col) )
            if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
                z[zOffset] = value;
            else if (this != &target) {      // when this and target are different arrays
-               if (xRank != zRank)
+               if (xRank != zRank) {
+                   temp = coords[0];
                    coords[0] = coords[1];
+               }

                const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords);
                z[zOffset] = x[xOffset];
+
+               if (xRank != zRank)      // restore first coordinate
+                   coords[0] = temp;
            }
        }
    };
@@ -376,12 +383,16 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int>& repeats)
            if (repSize > 1) {
                 for (uint j = 0; j < repSize; ++j) {
                     coords[axis] -= repeats[j];
@@ -394,6 +405,8 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int>& repeats)
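The fillAsTriangular hunk above is the template for most CPU changes in this patch: one int coordinate buffer per thread, updated incrementally by index2coordsCPU (defined later in the shape.h diff), with any in-loop mutation saved to temp and restored. A reduced, self-contained sketch of that save/restore discipline, with made-up values rather than library code:

    #include <cstdio>

    int main() {
        // Hypothetical reduced example: the coords buffer persists across
        // iterations, so a mutation made for one lookup must be undone, exactly
        // as fillAsTriangular does with coords[0] when the ranks differ.
        int coords[2] = {0, 0};
        for (int i = 0; i < 4; ++i) {
            coords[1] = i;                 // stand-in for the incremental update
            const int temp = coords[0];    // save before clobbering
            coords[0] = coords[1];         // temporary re-mapping for a lookup
            printf("lookup at (%d,%d)\n", coords[0], coords[1]);
            coords[0] = temp;              // restore, or the next iteration is wrong
        }
        return 0;
    }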
diff --git a/libnd4j/include/array/cuda/NDArray.cu b/libnd4j/include/array/cuda/NDArray.cu
     const auto x = reinterpret_cast<const T*>(vx);
     auto z = reinterpret_cast<T*>(vz);

-    __shared__ int zRank, xRank, areSameOffsets;                // xRank == zRank always, except when xRank = 1, in this case zRank = 2
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;         // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+    __shared__ int zRank, xRank, areSameOffsets, *sharedMem;    // xRank == zRank always, except when xRank = 1, in this case zRank = 2
+    __shared__ Nd4jLong zLen, totalThreads;                     // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
         xRank = shape::rank(xShapeInfo);
         zRank = shape::rank(zShapeInfo);
@@ -137,7 +137,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(decltype(*target.getShapeInfo())) * target.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * target.rankOf() + 128;

     PointersManager manager(getContext(), "NDArray::fillAsTriangular");
@@ -155,12 +155,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
     auto x = reinterpret_cast<T*>(vx);

-    __shared__ int rank;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong len, totalThreads;              // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         rank = shape::rank(xShapeInfo);
         len = shape::length(xShapeInfo);
         totalThreads = gridDim.x * blockDim.x;
@@ -201,7 +201,7 @@ void NDArray::setIdentity() {
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(decltype(getShapeInfo())) * rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * rankOf() + 128;

     PointersManager manager(getContext(), "NDArray::setIdentity");
@@ -398,13 +398,13 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo,
     const X* x = reinterpret_cast<const X*>(vx);
           Z* z = reinterpret_cast<Z*>(vz);

-    __shared__ int rank;
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen = zLen
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong zLen, totalThreads;              // xLen = zLen

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         rank = shape::rank(zShapeInfo);     // xRank = zRank
         zLen = shape::length(zShapeInfo);   // xLen <= zLen
@@ -460,7 +460,7 @@ NDArray NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = output.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = output.rankOf() * sizeof(int) * threadsPerBlock + 128;

     PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");
@@ -484,7 +484,7 @@ void NDArray::repeat(const int axis, const std::vector<int>& repeats, NDArray& t
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = target.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = target.rankOf() * sizeof(int) * threadsPerBlock + 128;

     PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");

diff --git a/libnd4j/include/helpers/benchmark/ParametersBatch.h b/libnd4j/include/helpers/benchmark/ParametersBatch.h
index 5a045d5cd..c477f3ea8 100644
--- a/libnd4j/include/helpers/benchmark/ParametersBatch.h
+++ b/libnd4j/include/helpers/benchmark/ParametersBatch.h
@@ -46,7 +46,7 @@ namespace sd {
             int totalIterations = 1;

             // hehe
-            Nd4jLong xCoords[MAX_RANK];
+            int xCoords[MAX_RANK];
             Nd4jLong xShape[MAX_RANK];
             int xRank = _spaces.size();

diff --git a/libnd4j/include/helpers/cpu/MmulHelper.cpp b/libnd4j/include/helpers/cpu/MmulHelper.cpp
index edbc45fd4..62d8153ef 100644
--- a/libnd4j/include/helpers/cpu/MmulHelper.cpp
+++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp
@@ -63,7 +63,7 @@ static void usualGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
         for (auto i = start; i < stop; ++i) {

             // evaluate C coordinates
-            shape::index2coords(i, cShapeInfo, cCoords.data());
+            shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());

             // evaluate A coordinates
             aCoords[aMaxis] = cCoords[cMaxis];
@@ -433,12 +433,12 @@ static void batchedGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
     auto func = PRAGMA_THREADS_FOR {

-        std::vector<Nd4jLong> aCoords(aRank), bCoords(bRank), cCoords(cRank);
+        std::vector<int> aCoords(aRank), bCoords(bRank), cCoords(cRank);

         for (auto i = start; i < stop; ++i) {

             // evaluate C coordinates
-            shape::index2coords(i, cShapeInfo, cCoords.data());
+            shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());

             // calculate index of current batch
             Nd4jLong batchInd;

diff --git a/libnd4j/include/helpers/cuda_off/MmulHelper.cu b/libnd4j/include/helpers/cuda_off/MmulHelper.cu
index 13fc3b1b5..5e9304e88 100644
--- a/libnd4j/include/helpers/cuda_off/MmulHelper.cu
+++ b/libnd4j/include/helpers/cuda_off/MmulHelper.cu
@@ -40,15 +40,15 @@ static __global__ void usualCudaGemm(const void* vA, const Nd4jLong* aShapeInfo,
     const T2* B = reinterpret_cast<const T2*>(vB);
           T3* C = reinterpret_cast< T3*>(vC);

-    __shared__ int K;
+    __shared__ int K, *coords;
     __shared__ bool betaPresent;
-    __shared__ Nd4jLong cLen, totalThreads, *coords;
+    __shared__ Nd4jLong cLen, totalThreads;
     __shared__ T3 alphaZ, betaZ;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        coords = reinterpret_cast<Nd4jLong*>(shmem);
+        coords = reinterpret_cast<int*>(shmem);
         cLen = shape::length(cShapeInfo);
         K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
@@ -263,7 +263,7 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, dou
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 6 + 128;  // 6 = aRank + bRank + cRank
+    const int sharedMem = threadsPerBlock * sizeof(int) * 6 + 128;       // 6 = aRank + bRank + cRank

     NDArray::prepareSpecialUse({C}, {A, B});

     // BUILD_TRIPLE_SELECTOR(aType, bType, cType, usualGemm, (blocksPerGrid, threadsPerBlock, sharedMem, stream, A->getSpecialBuffer(), A->getSpecialShapeInfo(), B->getSpecialBuffer(), B->getSpecialShapeInfo(), C->getSpecialBuffer(), C->getSpecialShapeInfo(), 0, 1, 0, 1, 0, 1, alpha, beta), NUMERIC_TYPES, NUMERIC_TYPES, FLOAT_TYPES);
@@ -529,14 +529,14 @@ static __global__ void batchedCudaGemm(const void* vA, const Nd4jLong* aShapeInf
           T3* C = reinterpret_cast< T3*>(vC);

     __shared__ bool betaPresent;
-    __shared__ int aRank, bRank, cRank, K;
-    __shared__ Nd4jLong cLen, totalThreads, *coords;
+    __shared__ int aRank, bRank, cRank, K, *coords;
+    __shared__ Nd4jLong cLen, totalThreads;
     __shared__ T3 alphaZ, betaZ;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        coords = reinterpret_cast<Nd4jLong*>(shmem);
+        coords = reinterpret_cast<int*>(shmem);
         cLen = shape::length(cShapeInfo);
         K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
@@ -649,7 +649,7 @@ NDArray* MmulHelper::mmulNxN(const NDArray* A, const NDArray* B, NDArray* C, con
     const int threadsPerBlock = MAX_NUM_THREADS / 8;
     const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * (aRank + bRank + cRank) + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * (aRank + bRank + cRank) + 128;

     PointersManager manager(A->getContext(), "MmulHelper::mmulNxN");
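Every CUDA launcher touched in this patch budgets dynamic shared memory the same way: one scratch coordinate per thread per dimension, plus a 128-byte pad. Narrowing the scratch type from Nd4jLong to int therefore roughly halves the request per block. A sketch of the arithmetic; threadsPerBlock and rank are assumed values, not taken from the library:

    #include <cstdint>
    #include <cstdio>

    int main() {
        using Nd4jLong = int64_t;                    // the library's index type
        const size_t threadsPerBlock = 128;          // assumed block size (MAX_NUM_THREADS / 2 style)
        const size_t rank = 6;                       // e.g. aRank + bRank + cRank in mmulMxM
        const size_t before = threadsPerBlock * sizeof(Nd4jLong) * rank + 128;
        const size_t after  = threadsPerBlock * sizeof(int) * rank + 128;
        printf("dynamic shared memory per block: %zu -> %zu bytes\n", before, after);
        return 0;
    }

Less shared memory per block generally allows more resident blocks per SM, which is the likely motivation alongside the smaller coordinate arithmetic.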
diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp
index fdccd51bc..aa8e917cc 100644
--- a/libnd4j/include/helpers/impl/ShapeUtils.cpp
+++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp
@@ -306,7 +306,6 @@ std::vector<Nd4jLong> ShapeUtils::evalRepeatShape(int axis, const std::vector<int>& repeats, const NDArray& arr)

diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h
     shape::index2coords(const_cast(maxIdx), maxShapeInfo, maxIdxs);

-    Nd4jLong minIdxs[MAX_RANK];
+    int minIdxs[MAX_RANK];
     maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);

     return shape::coords2index(minShapeInfo, minIdxs);
@@ -4374,17 +4376,17 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
 //////////////////////////////////////////////////////////////////////
 INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {

-    Nd4jLong maxIdxs[MAX_RANK];
+    int maxIdxs[MAX_RANK];
     shape::index2coords(const_cast(maxIdx), maxShapeInfo, maxIdxs);

-    Nd4jLong minIdxs[MAX_RANK];
+    int minIdxs[MAX_RANK];
     maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);

     return getOffset(minShapeInfo, minIdxs);
 }

 //////////////////////////////////////////////////////////////////////
-    INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude) {
+    INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, int* memBuff, const int* dimsToExclude) {

         const auto rankMin = shape::rank(minShapeInfo);
         const auto rankMax = shape::rank(maxShapeInfo);
@@ -4394,8 +4396,8 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
         const auto diff = rankMax - rankMin;     // the size of dimsToExclude is equal to diff

-        Nd4jLong* indices = memBuff;
-        Nd4jLong* increment = memBuff + rankMax;
+        int* indices = memBuff;
+        int* increment = memBuff + rankMax;

         int N, minI, maxI;
@@ -4457,7 +4459,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
     }

 //////////////////////////////////////////////////////////////////////
-    INLINEDEF _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {
+    INLINEDEF _CUDA_HD int outerArrayIndexes(int* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {

         const auto rankMin = shape::rank(minShapeInfo);
         const auto rankMax = shape::rank(maxShapeInfo);
@@ -4469,9 +4471,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
         const auto diff = rankMax - rankMin;     // the size of dimsToExclude is equal to diff

-        Nd4jLong buffer[MAX_RANK];
-        Nd4jLong* indices = buffer;
-        Nd4jLong* increment = buffer + MAX_RANK/2;
+        int indices[MAX_RANK], increment[MAX_RANK];

         int N, minI, maxI;
@@ -4886,7 +4886,7 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jL
 }

 //////////////////////////////////////////////////////////////////////
-INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims) {
+INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims) {

     for(uint i = dimsSize - 1; i > 0; --i) {
         coords[tadDims[i]] = index % shapeInfo[1 + tadDims[i]];
@@ -4895,6 +4895,34 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo,
     coords[tadDims[0]] = index;      // last iteration
 }

+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords) {
+
+    if(startIndex == index) {
+        shape::index2coords(index, shapeInfo, coords);
+    }
+    else {
+        int axis = shapeInfo[0] - 1;
+        while(coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
+            coords[axis--] = 0;
+        ++coords[axis];
+    }
+}
+
+//////////////////////////////////////////////////////////////////////
+INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords) {
+
+    if(startIndex == index) {
+        shape::index2coords(index, shapeInfo, coords);
+    }
+    else {
+        int axis = shapeInfo[0] - 1;
+        while(coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
+            coords[axis--] = 0;
+        ++coords[axis];
+    }
+}
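The two index2coordsCPU overloads above are the heart of the PR: on the first index of a thread's range they fall back to the full div/mod index2coords, and afterwards they just tick the trailing coordinates over like an odometer, which is O(1) amortized per element instead of O(rank). A standalone model of that increment, assuming c-order and a plain shape array in place of shapeInfo:

    #include <cstdio>

    // Illustrative only, not the library function: advance c-order coordinates
    // by one index, carrying over any axis that has reached its end.
    void nextCoords(const int* shape, int rank, int* coords) {
        int axis = rank - 1;
        while (coords[axis] == shape[axis] - 1)   // carry over exhausted axes
            coords[axis--] = 0;
        ++coords[axis];
    }

    int main() {
        int shape[3] = {2, 3, 4};
        int coords[3] = {0, 0, 0};                // coordinates of index 0
        for (int i = 1; i < 2 * 3 * 4; ++i) {
            nextCoords(shape, 3, coords);
            printf("%d -> (%d,%d,%d)\n", i, coords[0], coords[1], coords[2]);
        }
        return 0;
    }

This is also why the callers below pass `start` as the first argument: the odometer is only valid when indices are visited consecutively from the seeded one.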
+
 //////////////////////////////////////////////////////////////////////
 // INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) {
@@ -5131,23 +5159,23 @@ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo,

 //////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
+// INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {

-    Nd4jLong result = 9223372036854775807LL;
+//     Nd4jLong result = 9223372036854775807LL;

-    for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
+//     for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {

-        const auto currentStride = shape::stride(inShapeInfo)[i];
+//         const auto currentStride = shape::stride(inShapeInfo)[i];

-        if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
-            continue;
+//         if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
+//             continue;

-        if(result > currentStride)
-            result = currentStride;
-    }
+//         if(result > currentStride)
+//             result = currentStride;
+//     }

-    return result == 9223372036854775807LL ? 1 : result;
-}
+//     return result == 9223372036854775807LL ? 1 : result;
+// }

diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp
index d69396e70..2b24dc17a 100644
--- a/libnd4j/include/loops/cpu/broadcasting.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting.hpp
@@ -739,11 +739,11 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
         auto func = PRAGMA_THREADS_FOR {

-            Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+            int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

             for (auto i = start; i < stop; ++i) {

-                shape::index2coords(i, zShapeInfo, zCoords);
+                shape::index2coordsCPU(start, i, zShapeInfo, zCoords);

                 for (uint j = 0; j < rank; ++j) {
                     xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.hpp b/libnd4j/include/loops/cpu/broadcasting_bool.hpp
index 9f9e95f50..ef8a35c48 100644
--- a/libnd4j/include/loops/cpu/broadcasting_bool.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting_bool.hpp
@@ -449,11 +449,11 @@ void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
         auto func = PRAGMA_THREADS_FOR {

-            Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+            int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

             for (auto i = start; i < stop; ++i) {

-                shape::index2coords(i, zShapeInfo, zCoords);
+                shape::index2coordsCPU(start, i, zShapeInfo, zCoords);

                 for (uint j = 0; j < rank; ++j) {
                     xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

diff --git a/libnd4j/include/loops/cpu/broadcasting_int.hpp b/libnd4j/include/loops/cpu/broadcasting_int.hpp
index 9c5186f8c..95f54881d 100644
--- a/libnd4j/include/loops/cpu/broadcasting_int.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting_int.hpp
@@ -609,11 +609,11 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
         auto func = PRAGMA_THREADS_FOR {

-            Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+            int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

             for (auto i = start; i < stop; ++i) {

-                shape::index2coords(i, zShapeInfo, zCoords);
+                shape::index2coordsCPU(start, i, zShapeInfo, zCoords);

                 for (uint j = 0; j < rank; ++j) {
                     xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];

diff --git a/libnd4j/include/loops/cuda/broadcasting.chpp b/libnd4j/include/loops/cuda/broadcasting.chpp
index f54386975..49270ddcc 100644
--- a/libnd4j/include/loops/cuda/broadcasting.chpp
+++ b/libnd4j/include/loops/cuda/broadcasting.chpp
@@ -275,7 +275,7 @@ __device__ void Broadcast<X, Y, Z>::transformCuda(
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

     for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu
index 513db1b7c..aae6bb141 100644
--- a/libnd4j/include/loops/cuda/broadcasting_bool.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu
@@ -291,7 +291,7 @@ __device__ void BroadcastBool<X, Z>::transformCuda(const void *vx, const Nd4jLong
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

     for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu
index 651aaecc5..f9ad3218c 100644
--- a/libnd4j/include/loops/cuda/broadcasting_int.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_int.cu
@@ -271,7 +271,7 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];

     for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {

diff --git a/libnd4j/include/loops/cuda/specials/concatKernel.cu b/libnd4j/include/loops/cuda/specials/concatKernel.cu
index 59c6c5380..a4a849e49 100644
--- a/libnd4j/include/loops/cuda/specials/concatKernel.cu
+++ b/libnd4j/include/loops/cuda/specials/concatKernel.cu
@@ -137,7 +137,7 @@ namespace sd {
                     T *dataTAD = currentData + inputOffset;
                     T *resultTAD = result + resultOffset;

-                    Nd4jLong sub[MAX_RANK];
+                    int sub[MAX_RANK];

                     shape::index2coords(arrOffset, zTadShape, sub);

@@ -166,7 +166,7 @@ namespace sd {
                     auto dataTAD = currentData + inputOffset;
                     auto resultTAD = result + resultOffset;

-                    Nd4jLong sub[MAX_RANK];
+                    int sub[MAX_RANK];

                     shape::index2coords(arrOffset, zTadShape, sub);
                     Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
@@ -199,7 +199,7 @@ namespace sd {
                             resultTAD[baseIdx + k * tadEWS] = dataTAD[k];
                         }
                     } else {
-                        Nd4jLong yIdx[MAX_RANK];
+                        int yIdx[MAX_RANK];
                         auto yRank = shape::rank(currentTad);

                         for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
@@ -214,8 +214,8 @@ namespace sd {
                     //if (threadIdx.x == 0 && blockIdx.x == 0)
                     //    printf("Branch C; yLength: %i;\n", yLength);

-                    Nd4jLong zIdx[MAX_RANK];
-                    Nd4jLong yIdx[MAX_RANK];
+                    int zIdx[MAX_RANK];
+                    int yIdx[MAX_RANK];

                     auto yRank = shape::rank(currentTad);
                     auto tadRank = shape::rank(zTadShape);

diff --git a/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp b/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
index e835dc711..a59e3f02c 100644
--- a/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
+++ b/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
@@ -39,8 +39,7 @@ namespace sd {
             delim->syncToHost();

             // output rank N+1 wrt input rank
-            std::vector<Nd4jLong> ocoords(input->rankOf() + 1);
-            std::vector<Nd4jLong> icoords(input->rankOf());
+            std::vector<int> icoords(input->rankOf());

             // getting buffer lengths
             // FIXME: it'll be bigger, since it'll include delimiters,
@@ -54,7 +53,7 @@ namespace sd {
                 auto s = input->e<std::string>(e);

                 // getting base index
-                shape::index2coords(e, input->shapeInfo(), icoords.data());
+                shape::index2coordsCPU(0, e, input->shapeInfo(), icoords.data());

                 // getting number of substrings
                 auto cnt = StringUtils::countSubarrays(s.c_str(), s.length(), d.c_str(), d.length()) + 1;

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
index a0e6cf061..2293fe843 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
@@ -64,7 +64,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
     Nd4jLong* xOffsets = new Nd4jLong[steps];
     Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps];
-    Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()];
+    int* auxBuff = new int[2 * input->rankOf()];

     for (Nd4jLong j = 0; j < lenSmall; ++j) {

@@ -139,40 +139,42 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
     auto func = PRAGMA_THREADS_FOR {

-        Nd4jLong coords[MAX_RANK];
+        int xzCoords[MAX_RANK], minCoords[MAX_RANK];
+
+        for (uint i = 0, j = 0; i < xRank; ++i)
+            if(j < numAxes && i != axes[j])
+                minCoords[i] = 0;
+            else
+                ++j;

         for (auto i = start; i < stop; i++) {

-            shape::index2coords(i, input->getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, input->getShapeInfo(), xzCoords);

-            const auto xOffset = shape::getOffset(input->getShapeInfo(), coords);
-            const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), coords);
+            const auto xOffset = shape::getOffset(input->getShapeInfo(), xzCoords);
+            const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), xzCoords);

             if(minRank == xRank) {
-                for (uint i = 0, j = 0; i < xRank; ++i) {
-                    if(j < numAxes && i != axes[j])
-                        coords[i] = 0;
-                    else
-                        ++j;
-                }
+                for (uint j = 0; j < numAxes; ++j)
+                    minCoords[axes[j]] = xzCoords[axes[j]];
             }
             else // minRank = numAxes = 1 in this case
-                coords[0] = coords[axes[0]];
+                minCoords[0] = xzCoords[axes[0]];

-            const auto meanOffset     = shape::getOffset(mean->getShapeInfo(), coords);
-            const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), coords);
+            const auto meanOffset     = shape::getOffset(mean->getShapeInfo(), minCoords);
+            const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), minCoords);

             T sigmaInvGam = 1. / sd::math::nd4j_sqrt<T, T>(v[varianceOffset] + epsilon);

             if(g != nullptr) {
-                const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), coords);
+                const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), minCoords);
                 sigmaInvGam *= g[gammaOffset];
             }

             z[zOffset] = (x[xOffset] - m[meanOffset]) * sigmaInvGam;

             if(b != nullptr) {
-                const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), coords);
+                const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), minCoords);
                z[zOffset] += b[betaOffset];
             }
         }
     };
@@ -184,7 +186,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray

//////////////////////////////////////////////////////////////////////////
 void batchnorm(const NDArray* input, const NDArray* mean, const NDArray* variance, const NDArray* gamma, const NDArray* beta, NDArray* output, const std::vector<int>& axes, const double epsilon) {

-    // batchnorm2_ is slower
+    // batchnorm2_ is still slower ?
     BUILD_SINGLE_SELECTOR(input->dataType(), batchnorm_, (input, mean, variance, gamma, beta, output, axes, epsilon), FLOAT_TYPES);
 }
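batchnorm2_ now keeps two coordinate buffers: xzCoords for the input/output element and minCoords for the broadcast parameter arrays (mean, variance, gamma, beta), whose non-axis entries are zeroed once before the loop instead of on every element. A runnable sketch of that mapping; toParamCoords and the NCHW values are illustrative, not library code:

    #include <cstdio>
    #include <vector>

    // Hypothetical helper: map full-array coordinates to the coordinates of a
    // parameter array that varies only along `axes`. Non-axis entries stay
    // pinned at 0, so only axis coordinates are copied per element.
    void toParamCoords(const std::vector<int>& xzCoords, const std::vector<int>& axes,
                       std::vector<int>& minCoords) {
        for (int a : axes)
            minCoords[a] = xzCoords[a];   // only axis coordinates ever change
    }

    int main() {
        std::vector<int> xzCoords = {2, 5, 7, 3};   // assumed NCHW element coordinates
        std::vector<int> axes = {1};                // normalization over the channel axis
        std::vector<int> minCoords(4, 0);           // zeroed once, as in the patch
        toParamCoords(xzCoords, axes, minCoords);
        printf("param coords: (%d,%d,%d,%d)\n", minCoords[0], minCoords[1], minCoords[2], minCoords[3]);
        return 0;
    }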

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
index 46729fbb8..682677ef3 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
@@ -51,19 +51,19 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
     auto func = PRAGMA_THREADS_FOR{

-        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, output.getShapeInfo(), coords);
-            const auto zOffset  = shape::getOffset(output.getShapeInfo(), coords);
-            const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
-            const auto xOffset1 = xOffset0 + input.strideAt(dimC);
-            const auto xOffset2 = xOffset1 + input.strideAt(dimC);
-            z[zOffset] = 0.2989f*x[xOffset0] + 0.5870f*x[xOffset1] + 0.1140f*x[xOffset2];
-        }
-    };
+        int coords[MAX_RANK];
+        for (auto i = start; i < stop; i++) {
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
+            const auto zOffset  = shape::getOffset(output.getShapeInfo(), coords);
+            const auto xOffset0 = shape::getOffset(input.getShapeInfo(), coords);
+            const auto xOffset1 = xOffset0 + input.strideAt(dimC);
+            const auto xOffset2 = xOffset1 + input.strideAt(dimC);
+            z[zOffset] = 0.2989f*x[xOffset0] + 0.5870f*x[xOffset1] + 0.1140f*x[xOffset2];
+        }
+    };

-    samediff::Threads::parallel_for(func, 0, output.lengthOf(), 1);
-    return;
+    samediff::Threads::parallel_for(func, 0, output.lengthOf(), 1);
+    return;
 }

 void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray& output, const int dimC) {
@@ -78,9 +78,9 @@ FORCEINLINE static void rgbToFromYuv_(const NDArray& input, NDArray& output, con
     const int rank = input.rankOf();
     bool bSimple = (dimC == rank - 1 && 'c' == input.ordering() && 1 == input.ews() && 'c' == output.ordering() && 1 == output.ews());
-    
+
     if (bSimple) {
-        
+
         auto func = PRAGMA_THREADS_FOR{
             for (auto i = start; i < stop; i += increment) {
                 op(x[i], x[i + 1], x[i + 2], z[i], z[i + 1], z[i + 2]);
@@ -177,12 +177,12 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
     const T* x = input->bufferAsT<T>();
     T* z = output->bufferAsT<T>();

-    // TODO: Use tensordot or other optimizied helpers to see if we can get better performance. 
+    // TODO: Use tensordot or other optimizied helpers to see if we can get better performance.
     if (dimC == rank - 1 && input->ews() == 1 && output->ews() == 1 && input->ordering() == 'c' && output->ordering() == 'c') {

         auto func = PRAGMA_THREADS_FOR{
-            for (auto i = start; i < stop; i += increment) {  
+            for (auto i = start; i < stop; i += increment) {
                 //simple M*v //tr.T*v.T // v * tr //rule: (AB)' =B'A' // v.shape (1,3) row vector
                 T x0, x1, x2;
@@ -192,7 +192,7 @@ FORCEINLINE static void tripleTransformer(const NDArray* input, NDArray* output,
                 z[i]   = x0 * tr[0][0] + x1 * tr[1][0] + x2 * tr[2][0];
                 z[i+1] = x0 * tr[0][1] + x1 * tr[1][1] + x2 * tr[2][1];
                 z[i+2] = x0 * tr[0][2] + x1 * tr[1][2] + x2 * tr[2][2];
-                
+
             }
         };

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
index 3372950f2..60df150a9 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
@@ -49,9 +49,12 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp
     const auto xLen = input.lengthOf();

     auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
+        int coords[MAX_RANK];
+
         for (Nd4jLong i = 0; i < xLen; ++i) {
-            shape::index2coords(i, xShapeInfo, coords);
+
+            shape::index2coordsCPU(start, i, xShapeInfo, coords);

             const auto xOffset = shape::getOffset(xShapeInfo, coords);
             const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords);

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
index bbbb9199e..6a854bba8 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
@@ -113,18 +113,23 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
     // loop through input array
     auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
+        int zCoords[MAX_RANK], xCoords[MAX_RANK];
+
         for (auto i = start; i < stop; i++) {

-            shape::index2coords(i, output.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);

-            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+            memcpy(xCoords, zCoords, rank * sizeof(int));

             // evaluate spatial coordinates for x
             for (uint j = 1; j <= numOfSpatialDims; ++j)
-                coords[j] += crop.e<uint>(j - 1, 0);       // add crop left
+                xCoords[j] += crop.e<uint>(j - 1, 0);      // add crop left

-            z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
+            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
+
+            z[zOffset] = x[xOffset];
         }
     };

@@ -299,11 +304,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
     // loop through output array
     auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
-        for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, output.getShapeInfo(), coords);
-            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+
+        int zCoords[MAX_RANK], xCoords[MAX_RANK];
+
+        for (auto i = start; i < stop; i++) {
+
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
+
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
+
+            memcpy(xCoords, zCoords, rank * sizeof(int));

             bool within = true;

@@ -312,16 +322,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
                 const auto padLeft  = padding.e<uint>(j - 1, 0);
                 const auto padRight = padding.e<uint>(j - 1, 1);

-                within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight);
+                within &= zCoords[j] >= padLeft && zCoords[j] < output.sizeAt(j) - padRight;

                 if (!within)
                     break;

-                coords[j] -= padLeft;                    // get coordinates for x
+                xCoords[j] = zCoords[j] - padLeft;       // get coordinates for x
             }

             if (within)
-                z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+                z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
             else
                 z[zOffset] = 0.f;
         }

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
index 2d9250f9b..dd83a8618 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
@@ -43,11 +43,11 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int
     auto func = PRAGMA_THREADS_FOR {

-        Nd4jLong xCoords[MAX_RANK];
+        int xCoords[MAX_RANK];

         for (auto i = start; i < stop; i++) {

-            shape::index2coords(i, xShapeInfo, xCoords);
+            shape::index2coordsCPU(start, i, xShapeInfo, xCoords);

             const Nd4jLong currentInd = x[shape::getOffset(xShapeInfo, xCoords)];

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
index d138d9892..2e30cdf0a 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
@@ -96,14 +96,17 @@ namespace helpers {
         auto func = PRAGMA_THREADS_FOR{

-            Nd4jLong coords[MAX_RANK];
+            int coords[MAX_RANK], temp;
+
             for (auto i = start; i < stop; i += increment) {

-                shape::index2coords(i, input.getShapeInfo(), coords);
+                shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);

                 const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);

                 uint outArrIdx = 0;
+                temp = coords[axis];
+
                 while (coords[axis] >= zDim) {
                     coords[axis] -= zDim;
                     ++outArrIdx;
@@ -112,6 +115,8 @@ namespace helpers {
                 T* z = outArrs[outArrIdx]->bufferAsT<T>();
                 const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
                 z[zOffset] = xBuff[xOffset];
+
+                coords[axis] = temp;
             }
         };
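The walk over coords[axis] above is how split locates the destination array and the local coordinate within it; the patch saves and restores coords[axis] around the walk because the buffer is reused for the next index. A self-contained model (pickOutput is a hypothetical name, not the library's):

    #include <cstdio>

    // Given the coordinate along the split axis in the input, find which
    // (equally sized) output it falls into and its local coordinate there.
    int pickOutput(int& axisCoord, int zDim) {
        int outArrIdx = 0;
        while (axisCoord >= zDim) {
            axisCoord -= zDim;
            ++outArrIdx;
        }
        return outArrIdx;
    }

    int main() {
        int c = 7;                      // axis coordinate in the concatenated input
        const int zDim = 3;             // size of each output along the axis
        const int idx = pickOutput(c, zDim);
        printf("output #%d, local coord %d\n", idx, c);   // output #2, local coord 1
        return 0;
    }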

diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
index fa3570879..7169cca4a 100644
--- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
@@ -188,24 +188,35 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
         const T padVal = padValue.e<T>(0);

         auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong coords[MAX_RANK];
+
+            int zCoords[MAX_RANK], xCoords[MAX_RANK];
+
             for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), coords);
-                const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+
+                shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
+                const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
+
+                memcpy(xCoords, zCoords, rank * sizeof(int));

                 bool within = true;
+
                 for (int j = rankMinusOne; j >= 0; --j) {
-                    if (xShape[j] == zShape[j]) continue;
+
+                    if (xShape[j] == zShape[j])
+                        continue;
+
                     const auto left = paddings.e<Nd4jLong>(j, 0);
-                    if (coords[j] < left || coords[j] >= left + xShape[j]) {
+
+                    if (zCoords[j] < left || zCoords[j] >= left + xShape[j]) {
                         within = false;
                         break;
                     }
-                    else { coords[j] = coords[j] - left; }
+                    else
+                        xCoords[j] = zCoords[j] - left;
                 }

                 if (within)
-                    z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+                    z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
                 else
                     z[zOffset] = padVal;
             }
         };
@@ -219,20 +230,30 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
         const Nd4jLong shift2 = mode == 1 ? 2 : 1;         // REFLECT : SYMMETRIC

         auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong coords[MAX_RANK];
+
+            int zCoords[MAX_RANK], xCoords[MAX_RANK];
+
             for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), coords);
-                const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+
+                shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
+                const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
+
+                memcpy(xCoords, zCoords, rank * sizeof(int));

                 for (int j = rankMinusOne; j >= 0; --j) {

-                    if (xShape[j] == zShape[j]) continue;
-                    coords[j] = coords[j] - paddings.e<Nd4jLong>(j, 0);                              // are ready to fill middle (within input dimension range)
-                    if (coords[j] < 0) coords[j] = -coords[j] - shift1;                              // means fill from left
-                    else if (coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right
+                    if (xShape[j] == zShape[j])
+                        continue;
+
+                    xCoords[j] = zCoords[j] - paddings.e<Nd4jLong>(j, 0);      // are ready to fill middle (within input dimension range)
+
+                    if (xCoords[j] < 0)
+                        xCoords[j] = -xCoords[j] - shift1;                     // means fill from left
+                    else if (xCoords[j] >= xShape[j])
+                        xCoords[j] = 2 * xShape[j] - xCoords[j] - shift2;      // means fill from right
                 }

-                const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
+                const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
                 z[zOffset] = x[xOffset];
             }
         };
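pad_'s mirror branch reduces to one index transform per dimension. A standalone model of it; per the shift values in the patch, (shift1, shift2) is (0, 2) for REFLECT and (1, 1) for SYMMETRIC, and mirrorIndex is an illustrative name:

    #include <cstdio>

    // Map an output coordinate to the input coordinate it copies from.
    int mirrorIndex(int zCoord, int padLeft, int xDim, int shift1, int shift2) {
        int x = zCoord - padLeft;                         // position relative to the input
        if (x < 0)       return -x - shift1;              // mirrored off the left edge
        if (x >= xDim)   return 2 * xDim - x - shift2;    // mirrored off the right edge
        return x;                                         // interior: straight copy
    }

    int main() {
        // REFLECT-pad a length-3 axis by 2 on the left: expect x indices 2 1 0 1 2
        for (int z = 0; z < 5; ++z)
            printf("%d ", mirrorIndex(z, 2, 3, 0, 2));
        printf("\n");
        return 0;
    }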
@@ -562,45 +583,37 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
     const Nd4jLong zLen = output.lengthOf();

-    const int yLastDim = indices.sizeAt(-1);
+    const uint yLastDim = indices.sizeAt(-1);
+
+    const int diff = zRank - xRank;
+    const bool bEqual = yLastDim == xRank;

     auto func = PRAGMA_THREADS_FOR {

-        Nd4jLong coords[MAX_RANK * 3];
+        int xCoords[MAX_RANK], zCoords[MAX_RANK], temp;
+
         for (auto i = start; i < stop; i++) {

-            Nd4jLong *zCoordStart, *xCoordStart;
-
-            if (yLastDim == xRank) {
-                zCoordStart = coords;
-                xCoordStart = coords;
-            } else if (zRank >= xRank) {
-                zCoordStart = coords;
-                xCoordStart = coords + zRank - xRank;
-            } else {
-                zCoordStart = coords + xRank - zRank;
-                xCoordStart = coords;
-            }
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);

-            shape::index2coords(i, output.getShapeInfo(), zCoordStart);
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);

-            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart);
+            temp = zCoords[yRank - 1];
+            zCoords[yRank - 1] = 0;
+            const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoords);
+            zCoords[yRank - 1] = temp;

-            // last y coordinate
-            uint coordToRestore;
-            if (yLastDim != xRank)
-                coordToRestore = static_cast<uint>(zCoordStart[yRank - 1]);
+            if(bEqual)
+                memcpy(xCoords, zCoords, zRank * sizeof(int));
+            else if(diff >= 0)
+                memcpy(xCoords, zCoords + diff, xRank * sizeof(int));
+            else
+                memcpy(xCoords - diff, zCoords, zRank * sizeof(int));

-            zCoordStart[yRank - 1] = 0;
-            const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart);
+            for (uint j = 0; j < yLastDim; ++j)
+                xCoords[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]];   // last stride

-            //restore z coordinate
-            if (yLastDim != xRank)
-                zCoordStart[yRank - 1] = coordToRestore;
-
-            // construct coordinates for x
-            for (int j = 0; j < yLastDim; ++j)
-                xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]];   // last stride
-
-            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
+            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);

             z[zOffset] = x[xOffset];
         }
@@ -1188,10 +1201,12 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
     else {

         auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong inIdx[MAX_RANK];
-            Nd4jLong outIdx[MAX_RANK];
+
+            int inIdx[MAX_RANK], outIdx[MAX_RANK];
+
             for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), outIdx);
+
+                shape::index2coordsCPU(start, i, output.getShapeInfo(), outIdx);

                 for (int j = 0; j < rank; ++j) {
                     const Nd4jLong inLen = input.sizeAt(j);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
index 6f658c72e..4243c6e04 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
@@ -52,7 +52,7 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
     __syncthreads();

     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];

     for (int i = tid; i < xzLen; i += blockDim.x * gridDim.x) {
         shape::index2coords(i, xShapeInfo, coords);
@@ -124,7 +124,7 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI
     __syncthreads();

     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];

     for (int i = tid; i < inLen; i += totalThreads) {
         shape::index2coords(i, inShapeInfo, coords);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
index dad5a5b06..0878a1c77 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
@@ -45,7 +45,7 @@ __global__ static void addBiasCuda( const void* vx, const Nd4jLong* xShapeInfo,
           X* z = reinterpret_cast<X*>(vz);

     __shared__ int rank, channelPosition, posOfNonUnityDim;
-    __shared__ Nd4jLong *sharedMem, len;
+    __shared__ Nd4jLong len, *sharedMem;
     __shared__ bool xzSameOffsets, xzAreSame;

     if (threadIdx.x == 0) {
@@ -130,7 +130,7 @@ void addBias(sd::graph::Context& block, const NDArray& input, const NDArray& bia
                 FLOAT_TYPES, FLOAT_TYPES);
     }
     else { // default case
-        const int threadsPerBlock = MAX_NUM_THREADS / 2;
+        const int threadsPerBlock = MAX_NUM_THREADS / 4;
         const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
         const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
index 5e113ff2f..2daac26c3 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
@@ -124,7 +124,7 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
     }
     __syncthreads();

-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];

     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
index 6a41a9fc4..10e1d132c 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@@ -51,7 +51,7 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];

     for (uint64_t i = tid; i < zLen; i += totalThreads) {

         shape::index2coords(i, zShapeInfo, coords);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
index d02e99987..76ba2e1df 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@@ -706,7 +706,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
     T* z = reinterpret_cast<T*>(vz);

     __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -858,7 +858,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
     Nd4jLong coord2, coord3;
     __shared__ int rank, kHeff, kWeff, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, yLen;
+    __shared__ Nd4jLong yLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1017,7 +1017,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
     Nd4jLong coord2, coord3, coord4;
     __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, yLen;
+    __shared__ Nd4jLong yLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1342,7 +1342,7 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
     T* z = reinterpret_cast<T*>(vz);

     __shared__ int rank, dimIH;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1410,7 +1410,7 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
     T* z = reinterpret_cast<T*>(vz);

     __shared__ int rank, dimID;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1480,7 +1480,7 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
     __shared__ int rank, dimIH;
     __shared__ uint factorH, factorW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
@@ -1554,7 +1554,7 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
     __shared__ int rank, dimID;
     __shared__ uint factorD, factorH, factorW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
index 38e8d0cca..d7694641c 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
@@ -36,8 +36,8 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
     __shared__ const T* x;
     __shared__ const T* y;
     __shared__ T* z;
-    __shared__ int rank;
-    __shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong lenWithoutLastDim, totalThreads;

     if (threadIdx.x == 0) {
         x = reinterpret_cast<const T*>(vx);
         y = reinterpret_cast<const T*>(vy);
         z = reinterpret_cast<T*>(vz);

         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         totalThreads = gridDim.x * blockDim.x;

         rank = shape::rank(xShapeInfo);
@@ -106,7 +106,7 @@ void crossBatched(sd::LaunchContext* context, NDArray *x, NDArray *y, NDArray *z
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (x->lengthOf() / x->sizeAt(-1) + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = sizeof(Nd4jLong) * threadsPerBlock * x->rankOf() + 128;
+    const int sharedMem = sizeof(int) * threadsPerBlock * x->rankOf() + 128;

     PointersManager manager(context, "cross");

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
index b318465a7..c05b5fb6d 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
@@ -43,13 +43,13 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo,
     const X* y = reinterpret_cast<const X*>(vy);
           Z* z = reinterpret_cast<Z*>(vz);

-    __shared__ int xzRank, yRank;
+    __shared__ int xzRank, yRank, *sharedMem;
     __shared__ uint iH, iW, kH, kW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);

         zLen = shape::length(zShapeInfo);

@@ -119,7 +119,7 @@ void dilation2d(sd::LaunchContext* context, NDArray *input, NDArray *weights, ND
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (output->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(int) * threadsPerBlock + 128;

     NDArray::prepareSpecialUse({output}, {input, weights});
     BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), dilation2dCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), weights->getSpecialBuffer(), weights->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), sH, sW, pH, pW, dH, dW), FLOAT_TYPES);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
index e8b5e83c0..3600104e1 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
@@ -27,7 +27,7 @@ namespace sd {
    template <typename T>
    void _CUDA_G flattenKernel(void **xBuffers, Nd4jLong **xShapeInfos, Nd4jLong *offsets, Nd4jLong numInputs, void *zBuffer, Nd4jLong *zShapeInfo, char order) {

-        Nd4jLong xCoord[MAX_RANK];
+        int xCoord[MAX_RANK];

        // each block of threads works on 1 input array
        for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
index 5bce6c1b7..0dbca8c47 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
@@ -40,12 +40,12 @@ __global__ static void im2colCuda(const void *image, void *columns,
     const auto im = reinterpret_cast<const T*>(image);
           auto col = reinterpret_cast<T*>(columns);

-    __shared__ Nd4jLong colLen, *sharedMem, iH, iW;
-    __shared__ int imRank, colRank;
+    __shared__ Nd4jLong colLen, iH, iW;
+    __shared__ int imRank, colRank, *sharedMem;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);

         colRank = 6;
         imRank = 4;

@@ -81,7 +81,7 @@ __global__ static void im2colCuda(const void *image, void *columns,

//////////////////////////////////////////////////////////////////////////
 template <typename T>
 static void im2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, sd::LaunchContext & context, const void *image, void *columns, const Nd4jLong *imShapeInfo, const Nd4jLong *colShapeInfo, int sH, int sW, int pH, int pW, int dH, int dW, double zeroPadVal) {
-    im2colCuda<<>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
+    im2colCuda<<>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
 }

//////////////////////////////////////////////////////////////////////////

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu b/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu
index 11ec1f46e..54f306ef7 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu
@@ -149,19 +149,19 @@ __global__ void rgbToGrsCuda(const void *vx, const Nd4jLong *xShapeInfo, void *v
     const auto x = reinterpret_cast<const T*>(vx);
     auto z = reinterpret_cast<T*>(vz);

-    __shared__ Nd4jLong zLen, *sharedMem;
-    __shared__ int rank;                  // xRank == zRank
+    __shared__ Nd4jLong zLen;
+    __shared__ int rank, *sharedMem;      // xRank == zRank

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);

         zLen = shape::length(zShapeInfo);
         rank = shape::rank(zShapeInfo);
     }
     __syncthreads();

-    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+    auto coords = sharedMem + threadIdx.x * rank;

     for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) {

@@ -197,7 +197,7 @@ void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray&
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = input.rankOf() * sizeof(int) * threadsPerBlock + 128;

     NDArray::prepareSpecialUse({&output}, {&input});
     BUILD_SINGLE_SELECTOR(input.dataType(), rgbToGrsCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), dimC), NUMERIC_TYPES);

diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
index c771d12ff..e5773abf5 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@@ -39,14 +39,14 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI
     const auto y = reinterpret_cast<const T*>(vy);
     auto z = reinterpret_cast<T*>(vz);

-    __shared__ int xRank;                    // xRank = zRank, xRank = yRank + 1
-    __shared__ Nd4jLong xLen, *sharedMem;    // xLen = zLen
+    __shared__ int xRank, *sharedMem;        // xRank = zRank, xRank = yRank + 1
+    __shared__ Nd4jLong xLen;                // xLen = zLen
     __shared__ bool areSameOffsets;

     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);

         areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
@@ -56,7 +56,7 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI

     __syncthreads();

-    auto coords = sharedMem + threadIdx.x * xRank;  // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
+    auto coords = sharedMem + threadIdx.x * xRank;  // we provide (xRank * sizeof(int) * threadIdx.x) amount of shared memory per each thread
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;

     for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
@@ -86,7 +86,7 @@ void matrixSetDiag(sd::LaunchContext* context, const NDArray& input, const NDArr
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * input.rankOf() + 128;

     PointersManager manager(context, "matrixSetDiag");
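On the device side, the sizing above pairs with a fixed carve-up of the shared buffer: thread t uses elements [t * rank, (t + 1) * rank) as its coordinate scratch. A compilable skeleton of the pattern; coordsScratchKernel and the launch values are illustrative, not the library kernel:

    #include <cstdio>

    __global__ void coordsScratchKernel(int rank) {
        __shared__ int* sharedMem;
        if (threadIdx.x == 0) {
            extern __shared__ unsigned char shmem[];
            sharedMem = reinterpret_cast<int*>(shmem);   // alias the dynamic buffer once
        }
        __syncthreads();

        int* coords = sharedMem + threadIdx.x * rank;    // this thread's coordinates
        for (int d = 0; d < rank; ++d)
            coords[d] = 0;                               // index2coords/getOffset would work here
    }

    int main() {
        const int threadsPerBlock = 128, rank = 4;
        const int sharedMem = threadsPerBlock * sizeof(int) * rank + 128;  // same formula as the launchers
        coordsScratchKernel<<<1, threadsPerBlock, sharedMem>>>(rank);
        cudaDeviceSynchronize();
        return 0;
    }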
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
index da61b0e48..5784699d0 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
@@ -43,12 +43,12 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
 
     const auto x = reinterpret_cast<const T*>(vx);
     auto z = reinterpret_cast<T*>(vz);
 
-    __shared__ int rank;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong zLen;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -103,7 +103,7 @@ void batchToSpace(sd::LaunchContext* context, const NDArray& input, NDArray& out
 
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
 
     PointersManager manager(context, "batchToSpace");
 
@@ -138,13 +138,13 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape
 
     const auto y = reinterpret_cast<const Y*>(vy);
     auto z = reinterpret_cast<X*>(vz);
 
-    __shared__ int rank;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong zLen;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -234,7 +234,7 @@ void batchToSpaceND(sd::LaunchContext* context, const NDArray& input, const NDAr
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
 
     PointersManager manager(context, "batchToSpaceND");
 
@@ -264,12 +264,12 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
 
     const auto x = reinterpret_cast<const T*>(vx);
     auto z = reinterpret_cast<T*>(vz);
 
-    __shared__ int rank;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ int rank, *sharedMem;
+    __shared__ Nd4jLong zLen;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -326,7 +326,7 @@ void spaceToBatch(sd::LaunchContext* context, const NDArray& input, NDArray& out
 
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
 
     PointersManager manager(context, "spaceToBatch");
 
@@ -364,13 +364,13 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape
 
     const auto y = reinterpret_cast<const Y*>(vy);
     auto z = reinterpret_cast<X*>(vz);
 
-    __shared__ int rank; // xRank = zRank, yRank = 2;
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
+    __shared__ int rank, *sharedMem; // xRank = zRank, yRank = 2;
+    __shared__ Nd4jLong zLen, totalThreads;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         rank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -473,7 +473,7 @@ void spaceToBatchND(sd::LaunchContext* context, const NDArray& input, const NDAr
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
 
     PointersManager manager(context, "spaceToBatchND");
 
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
index 4e7360004..364ad83d2 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@@ -628,12 +628,12 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
 
     auto y = reinterpret_cast<Z*>(vy);
     auto z = reinterpret_cast<Z*>(vz);
 
-    __shared__ Nd4jLong xLen, *sharedMem;
-    __shared__ int xRank; // xRank = zRank, yRank = xRank + 1
+    __shared__ Nd4jLong xLen;
+    __shared__ int xRank, *sharedMem; // xRank = zRank, yRank = xRank + 1
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         xLen = shape::length(xShapeInfo);
         xRank = shape::rank(xShapeInfo);
@@ -678,7 +678,7 @@ void scatterForLoss(sd::LaunchContext* context, const NDArray& indices, NDArray&
 
     const int threadsPerBlock = MAX_NUM_THREADS / 2;
     const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = updates.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = updates.rankOf() * sizeof(int) * threadsPerBlock + 128;
 
     if(calcGrad) {
         NDArray::prepareSpecialUse({&updates}, {&indices});
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/split.cu b/libnd4j/include/ops/declarable/helpers/cuda/split.cu
index 5c4b4b014..5690d786c 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/split.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/split.cu
@@ -54,7 +54,7 @@ __global__ static void splitCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
 
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];
 
     for (uint64_t i = tid; i < xLen; i += totalThreads) {
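Unlike the kernels above, splitCuda keeps its coordinates in a per-thread local array, so no shared-memory sizing is involved; the int narrowing simply halves the scratch from 8*MAX_RANK to 4*MAX_RANK bytes per thread. A sketch of that local-array variant (copyKernel is illustrative, not from the patch):

    // Grid-stride loop with a thread-private coordinate array; no dynamic shared memory.
    template <typename T>
    __global__ void copyKernel(const T* x, T* z, const Nd4jLong* xShapeInfo,
                               const Nd4jLong* zShapeInfo, const Nd4jLong len) {

        int coords[MAX_RANK];   // thread-private; 4 bytes per dimension instead of 8

        for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += gridDim.x * blockDim.x) {
            shape::index2coords(i, zShapeInfo, coords);
            z[shape::getOffset(zShapeInfo, coords)] = x[shape::getOffset(xShapeInfo, coords)];
        }
    }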
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
index 0ded43593..518525ecf 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
@@ -135,13 +135,13 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
 
     const int rank = 3;
 
-    __shared__ int time, K;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;
+    __shared__ int time, K, *sharedMem;
+    __shared__ Nd4jLong len, totalThreads;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         time = xShapeInfo[1];
         K = xShapeInfo[3] / 2;
@@ -152,7 +152,7 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo,
 
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+    auto coords = sharedMem + threadIdx.x * rank;
 
     if(tid >= len)
         return;
@@ -245,7 +245,7 @@ void sruBI(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDAr
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock; // loop through last two dimensions of x array -> bS, 2*K
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
 
     NDArray::prepareSpecialUse({ht, ct}, {x, &wi, b, c0, mask});
     BUILD_SINGLE_SELECTOR(x->dataType(), sruBICudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ht->specialBuffer(), ht->specialShapeInfo(), ct->specialBuffer(), ct->specialShapeInfo()), FLOAT_TYPES);
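BUILD_SINGLE_SELECTOR dispatches on the array's runtime DataType to a template instantiation of the launcher; conceptually it behaves like a type switch over the listed type set. A simplified sketch of the idea (not the actual macro expansion):

    // Rough equivalent of BUILD_SINGLE_SELECTOR(x->dataType(), sruBICudaLauncher, (...), FLOAT_TYPES):
    switch (x->dataType()) {
        case sd::DataType::FLOAT32: sruBICudaLauncher<float>(/* launch args */);   break;
        case sd::DataType::DOUBLE:  sruBICudaLauncher<double>(/* launch args */);  break;
        case sd::DataType::HALF:    sruBICudaLauncher<float16>(/* launch args */); break;
        default: throw std::runtime_error("sruBI: unsupported data type");
    }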
@@ -340,13 +340,13 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI
 
     const int rank = 3;
 
-    __shared__ int time, K;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;
+    __shared__ int time, K, *sharedMem;
+    __shared__ Nd4jLong len, totalThreads;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         time = xShapeInfo[1];
         K = xShapeInfo[3] / 2;
@@ -358,7 +358,7 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI
 
     __syncthreads();
 
     const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+    auto coords = sharedMem + threadIdx.x * rank;
 
     if(tid >= len)
         return;
@@ -513,7 +513,7 @@ void sruBIBP(sd::LaunchContext* context, NDArray* x, const NDArray* w, const NDA
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock; // loop through last two dimensions of x array -> bS, 2*K
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
 
     NDArray::prepareSpecialUse({gradI, &gradWi, &gradBias, gradC0}, {x, &wi, b, c0, ct, gradCt, gradHt, mask});
     BUILD_SINGLE_SELECTOR(x->dataType(), sruBIBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ct->getSpecialBuffer(), ct->getSpecialShapeInfo(), gradHt->getSpecialBuffer(), gradHt->getSpecialShapeInfo(), gradCt->getSpecialBuffer(), gradCt->getSpecialShapeInfo(), gradI->specialBuffer(), gradI->specialShapeInfo(), gradWi.specialBuffer(), gradWi.specialShapeInfo(), gradBias.specialBuffer(), gradBias.specialShapeInfo(), gradC0->specialBuffer(), gradC0->specialShapeInfo()), FLOAT_TYPES);
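Both SRU launchers derive the grid from the product of the last two dimensions of x (bS and 2*K), one thread per (batch, feature) pair, while the time loop runs inside the kernel. A worked example of the arithmetic, assuming MAX_NUM_THREADS is 1024 (the actual constant may differ):

    // x: [time, bS, 2*K] = [50, 32, 256]
    const int threadsPerBlock = 1024 / 4;                                             // 256
    const int blocksPerGrid   = (32 * 256 + threadsPerBlock - 1) / threadsPerBlock;   // 8192 / 256 = 32
    const int sharedMem       = threadsPerBlock * sizeof(int) * 3 + 128;              // rank-3 coords: 3200 bytes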
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
index 2c4299a90..b4dcfb2f6 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
@@ -93,13 +93,13 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
 
     auto z = reinterpret_cast<T*>(vz);
 
     __shared__ T* sharedMem;
-    __shared__ int xRank, zRank; // xRank = zRank + 2
-    __shared__ Nd4jLong xLen, zLen, *coordsMem;
+    __shared__ int xRank, zRank, *coordsMem; // xRank = zRank + 2
+    __shared__ Nd4jLong xLen, zLen;
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
         sharedMem = reinterpret_cast<T*>(shmem);
-        coordsMem = reinterpret_cast<Nd4jLong*>(shmem + blockDim.x * sizeof(T));
+        coordsMem = reinterpret_cast<int*>(shmem + blockDim.x * sizeof(T));
 
         xRank = shape::rank(xShapeInfo);
         zRank = shape::rank(zShapeInfo);
@@ -109,7 +109,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
 
     }
     __syncthreads();
 
-    Nd4jLong* coords = coordsMem + threadIdx.x * xRank;
+    auto coords = coordsMem + threadIdx.x * xRank;
 
     for (uint m = blockIdx.x; m < zLen; m += gridDim.x) { // one block per each element of z, that is per each matrix
 
@@ -160,7 +160,7 @@ void trace(sd::LaunchContext* context, const NDArray& input, NDArray& output) {
 
     const uint diagLen = input.sizeAt(-1) < input.sizeAt(-2) ? input.sizeAt(-1) : input.sizeAt(-2);
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * (sizeof(Nd4jLong) * input.rankOf() + input.sizeOfT()) + 128;
+    const int sharedMem = threadsPerBlock * (sizeof(int) * input.rankOf() + input.sizeOfT()) + 128;
 
     NDArray::prepareSpecialUse({&output}, {&input});
     BUILD_SINGLE_SELECTOR(input.dataType(), traceCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), diagLen), LIBND4J_TYPES);
@@ -177,13 +177,13 @@ __global__ static void triuBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
 
     const auto x = reinterpret_cast<const T*>(vx); // gradO
     auto z = reinterpret_cast<T*>(vz);             // gradI
 
-    __shared__ int rank, areSameOffsets; // xRank = zRank
-    __shared__ Nd4jLong len, totalThreads, *sharedMem; // xLen = zLen
+    __shared__ int rank, areSameOffsets, *sharedMem; // xRank = zRank
+    __shared__ Nd4jLong len, totalThreads; // xLen = zLen
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
         areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
         rank = shape::rank(xShapeInfo);
         len = shape::length(zShapeInfo);
@@ -221,7 +221,7 @@ void triuBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gra
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (gradO.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * gradO.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * gradO.rankOf() + 128;
 
     PointersManager manager(context, "triuBP");
 
@@ -240,13 +240,13 @@ __global__ static void tileBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
 
     const auto x = reinterpret_cast<const T*>(vx); // gradO
     auto z = reinterpret_cast<T*>(vz);             // gradI
 
-    __shared__ int xRank, zRank; // xRank >= zRank
-    __shared__ Nd4jLong numOfXOffsets, zLen, totalThreads, *sharedMem; // xLen >= zLen
+    __shared__ int xRank, zRank, *sharedMem; // xRank >= zRank
+    __shared__ Nd4jLong numOfXOffsets, zLen, totalThreads; // xLen >= zLen
 
     if (threadIdx.x == 0) {
         extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 
         xRank = shape::rank(zShapeInfo);
         zLen = shape::length(zShapeInfo);
@@ -289,7 +289,7 @@ void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray
 
     const int threadsPerBlock = MAX_NUM_THREADS / 4;
     const int blocksPerGrid = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 2 * gradO.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * 2 * gradO.rankOf() + 128;
 
     PointersManager manager(context, "tileBP");
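traceCuda is the one kernel here that packs two differently-typed regions into a single dynamic buffer: blockDim.x values of T for per-block partial sums, followed by the int coordinate scratch. The int region starts at byte offset blockDim.x * sizeof(T), which stays 4-byte aligned since blockDim.x is a multiple of the warp size. The layout, isolated as a sketch:

    extern __shared__ unsigned char shmem[];
    T*   sums   = reinterpret_cast<T*>(shmem);                             // blockDim.x elements of T
    int* coords = reinterpret_cast<int*>(shmem + blockDim.x * sizeof(T));  // rank ints per thread

    // The host-side size in trace() above mirrors both regions:
    //   threadsPerBlock * (sizeof(int) * rank + sizeOfT) + 128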
diff --git a/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp b/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp
index 7a99b4ecc..36044907e 100644
--- a/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp
+++ b/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp
@@ -32,7 +32,7 @@ namespace sd {
 
     auto indices = reinterpret_cast<const I*>(vindices);
     auto output = reinterpret_cast<X*>(voutput);
 
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];
     uint64_t pos = 0;
     for (uint64_t e = 0L; e < length; e++) { // indices come in blocks
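The sparse fill consumes the flat indices buffer as consecutive rank-sized coordinate blocks, one block per stored value. A hedged sketch of that loop shape (names and the exact blocking are illustrative, not copied from the file):

    int coords[MAX_RANK];
    uint64_t pos = 0;                                 // position in the values buffer
    for (uint64_t e = 0; e < length; e += rank) {     // one block of `rank` indices per point
        for (int j = 0; j < rank; ++j)
            coords[j] = static_cast<int>(indices[e + j]);
        output[shape::getOffset(outShapeInfo, coords)] = values[pos++];
    }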
diff --git a/libnd4j/include/ops/declarable/helpers/impl/where.cpp b/libnd4j/include/ops/declarable/helpers/impl/where.cpp
index f73bcae9a..df8fd1074 100644
--- a/libnd4j/include/ops/declarable/helpers/impl/where.cpp
+++ b/libnd4j/include/ops/declarable/helpers/impl/where.cpp
@@ -29,11 +29,14 @@ namespace sd {
 
     NDArrayList list(0, true);
     int cnt = 0;
 
-    Nd4jLong idx[MAX_RANK];
+    int idx[MAX_RANK];
+
     for (Nd4jLong e = 0; e < condition.lengthOf(); e++) {
-        shape::index2coords(e, condition.getShapeInfo(), idx);
+
+        shape::index2coordsCPU(0, e, condition.getShapeInfo(), idx);
 
         auto offset = shape::getOffset(condition.getShapeInfo(), idx);
+
         if (condition.e<bool>(offset)) {
             auto array = NDArrayFactory::create_('c', {1, condition.rankOf()}, output.dataType(), output.getContext());
             for (int f = 0; f < condition.rankOf(); f++)
diff --git a/libnd4j/include/ops/impl/specials_single.hpp b/libnd4j/include/ops/impl/specials_single.hpp
index 3cf3d113e..ed86315f7 100644
--- a/libnd4j/include/ops/impl/specials_single.hpp
+++ b/libnd4j/include/ops/impl/specials_single.hpp
@@ -178,16 +178,18 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
 
     // general case
     auto func = PRAGMA_THREADS_FOR {
 
-        Nd4jLong coords[MAX_RANK];
+        int coords[MAX_RANK], temp;
 
         for (auto i = start; i < stop; i += increment) {
-            shape::index2coords(i, output.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
+
             const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
 
             uint inArrIdx = 0;
             uint xDim = inArrs[inArrIdx]->sizeAt(axis);
 
+            temp = coords[axis];
             while (coords[axis] >= xDim) {
                 coords[axis] -= xDim;
                 xDim = inArrs[++inArrIdx]->sizeAt(axis);
@@ -197,6 +199,8 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
 
             const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords);
             zBuff[zOffset] = x[xOffset];
+
+            coords[axis] = temp;
         }
     };
 
@@ -298,13 +302,15 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
 
     auto func = PRAGMA_THREADS_FOR {
 
-        Nd4jLong coords[MAX_RANK];
+        int coords[MAX_RANK], temp;
+
         for (auto i = start; i < stop; i += increment) {
 
-            shape::index2coords(i, input.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);
 
             const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
 
             uint outArrIdx = 0;
+            temp = coords[axis];
 
             while (coords[axis] >= zDim) {
                 coords[axis] -= zDim;
@@ -314,6 +320,8 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
 
             T* z = outArrs[outArrIdx]->bufferAsT<T>();
             const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
             z[zOffset] = xBuff[xOffset];
+
+            coords[axis] = temp;
         }
     };
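index2coordsCPU is the CPU-side optimization this patch leans on: because these loops visit flat indices sequentially, the coordinates of index i can be derived from those of i - 1 with a single increment-and-carry instead of a full division chain. That is also why concatCpuGeneric and splitCpuGeneric now save and restore coords[axis]: the next iteration's incremental update assumes coords was left describing index i. A sketch of the idea on plain arrays (the real helper works on shapeInfo buffers):

    // Odometer-style advance: valid only when indices are visited in order from startIndex.
    inline void index2coordsCPU(const int64_t startIndex, const int64_t index,
                                const int* shape, const int rank, int* coords) {
        if (index == startIndex) {                   // first call: full de-linearization
            int64_t i = index;
            for (int j = rank - 1; j >= 0; --j) { coords[j] = i % shape[j]; i /= shape[j]; }
            return;
        }
        for (int j = rank - 1; j >= 0; --j) {        // later calls: increment with carry
            if (++coords[j] < shape[j]) break;
            coords[j] = 0;
        }
    }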
diff --git a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
index 0769a9aef..94156d4bc 100644
--- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
@@ -258,6 +258,7 @@ TEST_F(PlaygroundTests, test_bert_2) {
 
     delete graph;
 }
 
+
 TEST_F(PlaygroundTests, test_one_off_ops_1) {
     auto x = NDArrayFactory::create<float>('c', {4, 128, 768});
     auto y = NDArrayFactory::create<float>('c', {4, 128, 1});
diff --git a/libnd4j/tests_cpu/layers_tests/TadTests.cpp b/libnd4j/tests_cpu/layers_tests/TadTests.cpp
index 421e71b01..5dfdf401d 100644
--- a/libnd4j/tests_cpu/layers_tests/TadTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/TadTests.cpp
@@ -289,7 +289,7 @@ TEST_F(TadTests, calcOffsets_1) {
 
 TEST_F(TadTests, outerArrayIndexes_1) {
 
     NDArray x('c', {2,3,4,5}, sd::DataType::FLOAT32);
-    Nd4jLong maxIdxs[120];
+    int maxIdxs[120];
 
     NDArray y1('c', {3,5}, sd::DataType::FLOAT32);
     const std::vector<int> dimsToExclude1 = {0,2};