[WIP] Shyrma coords (#305)

* - provide faster index2coords function for cpu Signed-off-by: Yurii <iuriish@yahoo.com> * - new faster index2coords function is introduced into cpu code Signed-off-by: Yurii <iuriish@yahoo.com> * - replace long long coordinates with int coordinates Signed-off-by: Yurii <iuriish@yahoo.com> * - add missed reload of coords2index function Signed-off-by: Yurii <iuriish@yahoo.com> * - reststart jenkins Signed-off-by: Yurii <iuriish@yahoo.com> * - rollback changes in convolutions.cu and addBias.cu Signed-off-by: Yurii <iuriish@yahoo.com>
2020-03-11 15:21:59 +02:00 · 2020-03-11 15:21:59 +02:00 · 58550b7c98
commit 58550b7c98
parent 50b7d82b96
43 changed files with 359 additions and 273 deletions
--- a/libnd4j/include/array/cpu/NDArray.cpp
+++ b/libnd4j/include/array/cpu/NDArray.cpp
@ -95,22 +95,29 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
    const bool areSameOffsets = shape::haveSameShapeAndStrides(getShapeInfo(), target.getShapeInfo());
    auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
        int coords[MAX_RANK], temp;
        for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, target.getShapeInfo(), coords);
+
            shape::index2coordsCPU(start, i, target.getShapeInfo(), coords);
            const auto zOffset = shape::getOffset(target.getShapeInfo(), coords);
            // if( (row + upper < col) || (row + lower > col) )
            if ((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1]))
                z[zOffset] = value;
            else if (this != &target) {      // when this and target are different arrays
-                if (xRank != zRank)
+                if (xRank != zRank) {
                    temp = coords[0];
                    coords[0] = coords[1];
                }
                const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(getShapeInfo(), coords);
                z[zOffset] = x[xOffset];
                if (xRank != zRank)     // restore first coordinate
                    coords[0] = temp;
            }
        }
    };
@ -376,12 +383,16 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
    // loop through input array
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), coords);
        int coords[MAX_RANK], temp;
        for (auto i = start; i < stop; i++) {
            shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
            temp = coords[axis];
            if (repSize > 1) {
                for (uint j = 0; j < repSize; ++j) {
                    coords[axis] -= repeats[j];
@ -394,6 +405,8 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector<int
                coords[axis] /= repeats[0];
            z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
            coords[axis] = temp;
        }
    };
--- a/libnd4j/include/array/cuda/NDArray.cu
+++ b/libnd4j/include/array/cuda/NDArray.cu
@ -85,12 +85,12 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha
    const auto x = reinterpret_cast<const T*>(vx);
          auto z = reinterpret_cast<T*>(vz);
-    __shared__ int zRank, xRank, areSameOffsets;        // xRank == zRank always, except when xRank = 1, in this case zRank = 2
+    __shared__ int zRank, xRank, areSameOffsets, *sharedMem;    // xRank == zRank always, except when xRank = 1, in this case zRank = 2
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+    __shared__ Nd4jLong zLen, totalThreads;                     // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
        xRank = shape::rank(xShapeInfo);
        zRank = shape::rank(zShapeInfo);
@ -137,7 +137,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, NDArray& t
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(decltype(*target.getShapeInfo())) * target.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * target.rankOf() + 128;
    PointersManager manager(getContext(), "NDArray::fillAsTriangular");
@ -155,12 +155,12 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo,
    auto x = reinterpret_cast<T*>(vx);
-    __shared__ int rank;
+    __shared__ int rank, *sharedMem;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
+    __shared__ Nd4jLong len, totalThreads;  // xLen == zLen, except when xRank = 1, in this case zLen = 2*xLen
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        rank = shape::rank(xShapeInfo);
        len  = shape::length(xShapeInfo);
        totalThreads = gridDim.x * blockDim.x;
@ -201,7 +201,7 @@ void NDArray::setIdentity() {
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(decltype(getShapeInfo())) * rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * rankOf() + 128;
    PointersManager manager(getContext(), "NDArray::setIdentity");
@ -398,13 +398,13 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo,
    const X* x = reinterpret_cast<const X*>(vx);
          Z* z = reinterpret_cast<Z*>(vz);
-    __shared__ int rank;
+    __shared__ int rank, *sharedMem;
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;  // xLen = zLen
+    __shared__ Nd4jLong zLen, totalThreads;  // xLen = zLen
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        rank = shape::rank(zShapeInfo);     // xRank = zRank
        zLen = shape::length(zShapeInfo);    // xLen <= zLen
@ -460,7 +460,7 @@ NDArray NDArray::repeat(const int axis, const std::vector<int>& repeats) const {
    const int threadsPerBlock = MAX_NUM_THREADS / 2;
    const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = output.rankOf() * sizeof(Nd4jLong) * threadsPerBlock  + 128;
+    const int sharedMem = output.rankOf() * sizeof(int) * threadsPerBlock  + 128;
    PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");
@ -484,7 +484,7 @@ void NDArray::repeat(const int axis, const std::vector<int>& repeats, NDArray& t
    const int threadsPerBlock = MAX_NUM_THREADS / 2;
    const int blocksPerGrid = (target.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = target.rankOf() * sizeof(Nd4jLong) * threadsPerBlock  + 128;
+    const int sharedMem = target.rankOf() * sizeof(int) * threadsPerBlock  + 128;
    PointersManager manager(getContext(), "NDArray::repeat(const int axis, const std::vector<int>& repeats)");
--- a/libnd4j/include/helpers/benchmark/ParametersBatch.h
+++ b/libnd4j/include/helpers/benchmark/ParametersBatch.h
@ -46,7 +46,7 @@ namespace sd {
            int totalIterations = 1;
            // hehe
-            Nd4jLong xCoords[MAX_RANK];
+            int xCoords[MAX_RANK];
            Nd4jLong xShape[MAX_RANK];
            int xRank = _spaces.size();
--- a/libnd4j/include/helpers/cpu/MmulHelper.cpp
+++ b/libnd4j/include/helpers/cpu/MmulHelper.cpp
@ -63,7 +63,7 @@ static  void usualGemm(const NDArray* vA, const NDArray* vB, NDArray* vC,
        for (auto i = start; i < stop; ++i) {
            // evaluate C coordinates
-            shape::index2coords(i, cShapeInfo, cCoords.data());
+            shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());
            // evaluate A coordinates
            aCoords[aMaxis] = cCoords[cMaxis];
@ -433,12 +433,12 @@ static void batchedGemm(const NDArray* vA, const NDArray* vB,  NDArray* vC,
    auto func = PRAGMA_THREADS_FOR {
-        std::vector<Nd4jLong> aCoords(aRank), bCoords(bRank), cCoords(cRank);
+        std::vector<int> aCoords(aRank), bCoords(bRank), cCoords(cRank);
        for (auto i = start; i < stop; ++i) {
            // evaluate C coordinates
-            shape::index2coords(i, cShapeInfo, cCoords.data());
+            shape::index2coordsCPU(start, i, cShapeInfo, cCoords.data());
            // calculate index of current batch
            Nd4jLong batchInd;
--- a/libnd4j/include/helpers/cuda_off/MmulHelper.cu
+++ b/libnd4j/include/helpers/cuda_off/MmulHelper.cu
@ -40,15 +40,15 @@ static __global__ void usualCudaGemm(const void* vA, const Nd4jLong* aShapeInfo,
    const T2* B = reinterpret_cast<const T2*>(vB);
          T3* C = reinterpret_cast<      T3*>(vC);
-    __shared__ int K;
+    __shared__ int K, *coords;
    __shared__ bool betaPresent;
-    __shared__ Nd4jLong cLen, totalThreads, *coords;
+    __shared__ Nd4jLong cLen, totalThreads;
    __shared__ T3 alphaZ, betaZ;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        coords = reinterpret_cast<Nd4jLong*>(shmem);
+        coords = reinterpret_cast<int*>(shmem);
        cLen = shape::length(cShapeInfo);
        K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
@ -263,7 +263,7 @@ NDArray* MmulHelper::mmulMxM(const NDArray* A, const NDArray* B, NDArray* C, dou
        const int threadsPerBlock = MAX_NUM_THREADS / 2;
        const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 6 + 128;                             // 6 = aRank + bRank + cRank
+        const int sharedMem = threadsPerBlock * sizeof(int) * 6 + 128;                             // 6 = aRank + bRank + cRank
        NDArray::prepareSpecialUse({C}, {A, B});
        // BUILD_TRIPLE_SELECTOR(aType, bType, cType, usualGemm, (blocksPerGrid, threadsPerBlock, sharedMem, stream, A->getSpecialBuffer(), A->getSpecialShapeInfo(), B->getSpecialBuffer(), B->getSpecialShapeInfo(), C->getSpecialBuffer(), C->getSpecialShapeInfo(), 0, 1, 0, 1, 0, 1, alpha, beta), NUMERIC_TYPES, NUMERIC_TYPES, FLOAT_TYPES);
@ -529,14 +529,14 @@ static __global__ void batchedCudaGemm(const void* vA, const Nd4jLong* aShapeInf
          T3* C = reinterpret_cast<      T3*>(vC);
    __shared__ bool betaPresent;
-    __shared__ int aRank, bRank, cRank, K;
+    __shared__ int aRank, bRank, cRank, K, *coords;
-    __shared__ Nd4jLong cLen, totalThreads, *coords;
+    __shared__ Nd4jLong cLen, totalThreads;
    __shared__ T3 alphaZ, betaZ;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        coords = reinterpret_cast<Nd4jLong*>(shmem);
+        coords = reinterpret_cast<int*>(shmem);
        cLen = shape::length(cShapeInfo);
        K = shape::shapeOf(const_cast<Nd4jLong*>(aShapeInfo))[aKaxis];
@ -649,7 +649,7 @@ NDArray* MmulHelper::mmulNxN(const NDArray* A, const NDArray* B, NDArray* C, con
    const int threadsPerBlock = MAX_NUM_THREADS / 8;
    const int blocksPerGrid = (C->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * (aRank + bRank + cRank) + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * (aRank + bRank + cRank) + 128;
    PointersManager manager(A->getContext(), "MmulHelper::mmulNxN");
--- a/libnd4j/include/helpers/impl/ShapeUtils.cpp
+++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp
@ -306,7 +306,6 @@ std::vector<Nd4jLong> ShapeUtils::evalRepeatShape(int axis, const std::vector<in
    if(repeats.size() == 1)
        outShape[axis] *= repeats[0];
    else
        outShape[axis] = std::accumulate(repeats.begin(), repeats.end(), 0);
--- a/libnd4j/include/helpers/shape.h
+++ b/libnd4j/include/helpers/shape.h
@ -915,12 +915,14 @@ namespace shape {
    ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo,  uint *coords);
    ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords);
    ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, int *coords);
    ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords);
    ND4J_EXPORT _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords);
    /**
    * take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
    */
-    ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims);
+    ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims);
    /**
    * Convert coordinates to the corresponding linear index (sequence number in other words)
@ -929,11 +931,11 @@ namespace shape {
    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords);
    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords);
    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const uint *coords);
-    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords);
+    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const int *coords);
    /**
    * take into account only dimensions stored in tadDims, tadDims must be sorted in increasing order!
    */
-    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords, const int dimsSize, const int* tadDims);
+    ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims);
   /**
   * increment n-dimensional array by one iteration by changing coord appropriately
@ -988,17 +990,17 @@ namespace shape {
    // function calculates the coordinates of min array (and saves them into minIdxs) given coordinates of max array (already stored in maxIdxs)
    // dimsToExclude - should be sorted in increasing order
    // dimsLen - length of dimsToExclude, if not set (= -1), then it is calculated as maxRank - minRank
-    ND4J_EXPORT _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr, const int dimsLen = -1);
+    ND4J_EXPORT _CUDA_HD void maxIndToMinInd(int* maxIdxs, int* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr, const int dimsLen = -1);
    // calculate indexes of max-array, these output indexes correspond to one minIdx index of min-array which is sub-array of max-array
    // dimsToExclude - should be sorted in increasing order
-    ND4J_EXPORT _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr);
+    ND4J_EXPORT _CUDA_HD int outerArrayIndexes(int* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude = nullptr);
    // calculate offsets of max-array, these offsets correspond to one minIdx index of min-array which is sub-array of max-array
    // maxOffsets - will contain calculated offsets of max-array, buffer for maxOffsets should be allocated beforehand
    // dimsToExclude - should be sorted in increasing order
    // memBuff - auxiliary memory buffer (size = 2 * max_rank) for coordinates and increments storing, should be allocated beforehand
-    ND4J_EXPORT _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude = nullptr);
+    ND4J_EXPORT _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, int* memBuff, const int* dimsToExclude = nullptr);
    // calculates offsets for entities (elements or sub-arrays), shape in context of sub-array means dimensions excluded from outer array
    // rank is equal to size of shape
@ -1064,7 +1066,7 @@ namespace shape {
    * get stride over contiguous axis (contiguous axis must have stride = 1)
    * for example when inShapeInfo is {4, 2,5,4,3,  60,1,5,20, 16384,0,99} then output is 5 (that is smallest stride in inShapeInfo except those equal to 1)
    */
-    INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo);
+    // INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo);
@ -1832,7 +1834,7 @@ INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const uint *
 }
 //////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices) {
+INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const int *indices) {
    Nd4jLong index, shift = 1;;
@ -1845,7 +1847,7 @@ INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape,
    return index;
 }
-INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords, const int dimsSize, const int* tadDims) {
+INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const int *coords, const int dimsSize, const int* tadDims) {
    Nd4jLong index, shift = 1;;
@ -4276,7 +4278,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap
 // max array is outer for min array, min array is sub-array of max array
 // function calculates the coordinates of min array (and saves them into minIdxs) given coordinates of max array (already stored in maxIdxs)
-INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, int dimsLen) {
+INLINEDEF _CUDA_HD void maxIndToMinInd(int* maxIdxs, int* minIdxs, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, int dimsLen) {
    const auto maxRank = shape::rank(maxShapeInfo);
    const auto minRank = shape::rank(minShapeInfo);
@ -4362,10 +4364,10 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
    //////////////////////////////////////////////////////////////////////
    INLINEDEF _CUDA_HD Nd4jLong subArrayIndex(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
-        Nd4jLong maxIdxs[MAX_RANK];
+        int maxIdxs[MAX_RANK];
        shape::index2coords(const_cast<Nd4jLong&>(maxIdx), maxShapeInfo, maxIdxs);
-        Nd4jLong minIdxs[MAX_RANK];
+        int minIdxs[MAX_RANK];
        maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
        return shape::coords2index(minShapeInfo, minIdxs);
@ -4374,17 +4376,17 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
    //////////////////////////////////////////////////////////////////////
    INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) {
-        Nd4jLong maxIdxs[MAX_RANK];
+        int maxIdxs[MAX_RANK];
        shape::index2coords(const_cast<Nd4jLong&>(maxIdx), maxShapeInfo, maxIdxs);
-        Nd4jLong minIdxs[MAX_RANK];
+        int minIdxs[MAX_RANK];
        maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen);
        return getOffset(minShapeInfo, minIdxs);
    }
    //////////////////////////////////////////////////////////////////////
-    INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, Nd4jLong* memBuff, const int* dimsToExclude) {
+    INLINEDEF _CUDA_HD int outerArrayOffsets(Nd4jLong* maxOffsets, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, int* memBuff, const int* dimsToExclude) {
        const auto rankMin = shape::rank(minShapeInfo);
        const auto rankMax = shape::rank(maxShapeInfo);
@ -4394,8 +4396,8 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
        const auto diff = rankMax - rankMin;     // the size of dimsToExclude is equal to diff
-        Nd4jLong* indices   = memBuff;
+        int* indices   = memBuff;
-        Nd4jLong* increment = memBuff + rankMax;
+        int* increment = memBuff + rankMax;
        int N, minI, maxI;
@ -4457,7 +4459,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
    }
    //////////////////////////////////////////////////////////////////////
-    INLINEDEF _CUDA_HD int outerArrayIndexes(Nd4jLong* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {
+    INLINEDEF _CUDA_HD int outerArrayIndexes(int* maxIdxs, const Nd4jLong minIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude) {
        const auto rankMin = shape::rank(minShapeInfo);
        const auto rankMax = shape::rank(maxShapeInfo);
@ -4469,9 +4471,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con
        const auto diff    = rankMax - rankMin;     // the size of dimsToExclude is equal to diff
-        Nd4jLong buffer[MAX_RANK];
+        int indices[MAX_RANK], increment[MAX_RANK];
        Nd4jLong* indices = buffer;
        Nd4jLong* increment = buffer + MAX_RANK/2;
        int N, minI, maxI;
@ -4886,7 +4886,7 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jL
 }
 //////////////////////////////////////////////////////////////////////
-INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords, const int dimsSize, const int* tadDims) {
+INLINEDEF _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, int *coords, const int dimsSize, const int* tadDims) {
    for(uint i = dimsSize - 1; i > 0; --i) {
        coords[tadDims[i]] = index % shapeInfo[1 + tadDims[i]];
@ -4895,6 +4895,34 @@ INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo,
    coords[tadDims[0]] = index;      // last iteration
 }
 //////////////////////////////////////////////////////////////////////
 INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, Nd4jLong *coords) {
    if(startIndex == index) {
        shape::index2coords(index, shapeInfo, coords);
    }
    else {
        int axis = shapeInfo[0] - 1;
        while(coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
            coords[axis--] = 0;
        ++coords[axis];
    }
 }
 //////////////////////////////////////////////////////////////////////
 INLINEDEF _CUDA_HD void index2coordsCPU(const Nd4jLong& startIndex, const Nd4jLong& index, const Nd4jLong *shapeInfo, int *coords) {
    if(startIndex == index) {
        shape::index2coords(index, shapeInfo, coords);
    }
    else {
        int axis = shapeInfo[0] - 1;
        while(coords[axis] == shape::sizeAt(shapeInfo, axis) - 1)
            coords[axis--] = 0;
        ++coords[axis];
    }
 }
 //////////////////////////////////////////////////////////////////////
 // INLINEDEF _CUDA_HD void calcOffsets(const Nd4jLong *xShapeInfo, Nd4jLong*& xOffsets, const Nd4jLong *yShapeInfo, Nd4jLong*& yOffsets, const Nd4jLong* zShapeInfo, Nd4jLong*& zOffsets, const char order) {
@ -5131,23 +5159,23 @@ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo,
 //////////////////////////////////////////////////////////////////////
-INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
+// INLINEDEF _CUDA_HD Nd4jLong strideOverContigAxis(const int axis, const Nd4jLong* inShapeInfo) {
-    Nd4jLong result = 9223372036854775807LL;
+//     Nd4jLong result = 9223372036854775807LL;
-    for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
+//     for(uint i = 0; i < shape::rank(inShapeInfo); ++i) {
-        const auto currentStride = shape::stride(inShapeInfo)[i];
+//         const auto currentStride = shape::stride(inShapeInfo)[i];
-        if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
+//         if(i == axis || shape::shapeOf(inShapeInfo)[i] == 1)
-            continue;
+//             continue;
-        if(result > currentStride)
+//         if(result > currentStride)
-            result = currentStride;
+//             result = currentStride;
-    }
+//     }
-    return result == 9223372036854775807LL ? 1 : result;
+//     return result == 9223372036854775807LL ? 1 : result;
-}
+// }
--- a/libnd4j/include/loops/cpu/broadcasting.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting.hpp
@ -739,11 +739,11 @@ void Broadcast<X, Y, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo, const
            auto func = PRAGMA_THREADS_FOR{
-                Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+                int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
                for (auto i = start; i < stop; ++i) {
-                    shape::index2coords(i, zShapeInfo, zCoords);
+                    shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
                    for (uint j = 0; j < rank; ++j) {
                        xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
--- a/libnd4j/include/loops/cpu/broadcasting_bool.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting_bool.hpp
@ -449,11 +449,11 @@ void BroadcastBool<X, Z>::exec(const void *vx, const Nd4jLong *xShapeInfo,
            auto func = PRAGMA_THREADS_FOR{
-                Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+                int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
                for (auto i = start; i < stop; ++i) {
-                    shape::index2coords(i, zShapeInfo, zCoords);
+                    shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
                    for (uint j = 0; j < rank; ++j) {
                        xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
--- a/libnd4j/include/loops/cpu/broadcasting_int.hpp
+++ b/libnd4j/include/loops/cpu/broadcasting_int.hpp
@ -609,11 +609,11 @@ void BroadcastInt<X>::exec(const void *vx, const Nd4jLong *xShapeInfo,
            auto func = PRAGMA_THREADS_FOR{
-                Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+                int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
                for (auto i = start; i < stop; ++i) {
-                    shape::index2coords(i, zShapeInfo, zCoords);
+                    shape::index2coordsCPU(start, i, zShapeInfo, zCoords);
                    for (uint j = 0; j < rank; ++j) {
                        xCoords[j] = shape::sizeAt(xShapeInfo, j) == 1 ? 0 : zCoords[j];
--- a/libnd4j/include/loops/cuda/broadcasting.chpp
+++ b/libnd4j/include/loops/cuda/broadcasting.chpp
@ -275,7 +275,7 @@ __device__ void Broadcast<X,Y,Z>::transformCuda(
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
    for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {
--- a/libnd4j/include/loops/cuda/broadcasting_bool.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu
@ -291,7 +291,7 @@ __device__ void BroadcastBool<X,Z>::transformCuda(const void *vx, const Nd4jLong
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
    for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {
--- a/libnd4j/include/loops/cuda/broadcasting_int.cu
+++ b/libnd4j/include/loops/cuda/broadcasting_int.cu
@ -271,7 +271,7 @@ __device__ void BroadcastInt<X>::transformCuda(const void *vx, const Nd4jLong *x
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
+    int xCoords[MAX_RANK], yCoords[MAX_RANK], zCoords[MAX_RANK];
    for (int i = tid; i < zLen; i += blockDim.x * gridDim.x) {
--- a/libnd4j/include/loops/cuda/specials/concatKernel.cu
+++ b/libnd4j/include/loops/cuda/specials/concatKernel.cu
@ -137,7 +137,7 @@ namespace sd {
                    T *dataTAD = currentData + inputOffset;
                    T *resultTAD = result + resultOffset;
-                    Nd4jLong sub[MAX_RANK];
+                    int sub[MAX_RANK];
                    shape::index2coords(arrOffset, zTadShape, sub);
@ -166,7 +166,7 @@ namespace sd {
                    auto dataTAD = currentData + inputOffset;
                    auto resultTAD = result + resultOffset;
-                    Nd4jLong sub[MAX_RANK];
+                    int sub[MAX_RANK];
                    shape::index2coords(arrOffset, zTadShape, sub);
                    Nd4jLong baseOffset = shape::getOffset(zTadShape, sub);
@ -199,7 +199,7 @@ namespace sd {
                                    resultTAD[baseIdx + k * tadEWS] = dataTAD[k];
                                }
                            } else {
-                                Nd4jLong yIdx[MAX_RANK];
+                                int yIdx[MAX_RANK];
                                auto yRank = shape::rank(currentTad);
                                for (int i = threadIdx.x; i < yLength; i+= blockDim.x) {
@ -214,8 +214,8 @@ namespace sd {
                            //if (threadIdx.x == 0 && blockIdx.x  == 0)
                            //    printf("Branch C; yLength: %i;\n", yLength);
-                            Nd4jLong zIdx[MAX_RANK];
+                            int zIdx[MAX_RANK];
-                            Nd4jLong yIdx[MAX_RANK];
+                            int yIdx[MAX_RANK];
                            auto yRank = shape::rank(currentTad);
                            auto tadRank = shape::rank(zTadShape);
--- a/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
+++ b/libnd4j/include/ops/declarable/generic/compat/compat_string_split.cpp
@ -39,8 +39,7 @@ namespace sd {
            delim->syncToHost();
            // output rank N+1 wrt input rank
-            std::vector<Nd4jLong> ocoords(input->rankOf() + 1);
+            std::vector<int> icoords(input->rankOf());
            std::vector<Nd4jLong> icoords(input->rankOf());
            // getting buffer lengths
            // FIXME: it'll be bigger, since it'll include delimiters,
@ -54,7 +53,7 @@ namespace sd {
                auto s = input->e<std::string>(e);
                // getting base index
-                shape::index2coords(e, input->shapeInfo(), icoords.data());
+                shape::index2coordsCPU(0, e, input->shapeInfo(), icoords.data());
                // getting number of substrings
                auto cnt = StringUtils::countSubarrays(s.c_str(), s.length(), d.c_str(), d.length()) + 1;
--- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp
@ -64,7 +64,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray*
        Nd4jLong* xOffsets = new Nd4jLong[steps];
        Nd4jLong* zOffsets = xzSameOffset ? xOffsets : new Nd4jLong[steps];
-        Nd4jLong* auxBuff = new Nd4jLong[2 * input->rankOf()];
+        int* auxBuff = new int[2 * input->rankOf()];
        for (Nd4jLong j = 0; j < lenSmall; ++j) {
@ -139,40 +139,42 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
    auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+        int xzCoords[MAX_RANK], minCoords[MAX_RANK];
        for (uint i = 0, j = 0; i < xRank; ++i)
            if(j < numAxes && i != axes[j])
                minCoords[i] = 0;
            else
                ++j;
        for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, input->getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, input->getShapeInfo(), xzCoords);
-            const auto xOffset = shape::getOffset(input->getShapeInfo(), coords);
+            const auto xOffset = shape::getOffset(input->getShapeInfo(), xzCoords);
-            const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), coords);
+            const auto zOffset = xzSameOffset ? xOffset : shape::getOffset(output->getShapeInfo(), xzCoords);
            if(minRank == xRank) {
-                for (uint i = 0, j = 0; i < xRank; ++i) {
+                for (uint j = 0; j < numAxes; ++j)
-                    if(j < numAxes && i != axes[j])
+                    minCoords[axes[j]] = xzCoords[axes[j]];
                        coords[i] = 0;
                    else
                        ++j;
                }
            }
            else    // minRank = numAxes = 1 in this case
-                coords[0] = coords[axes[0]];
+                minCoords[0] = xzCoords[axes[0]];
-            const auto meanOffset     = shape::getOffset(mean->getShapeInfo(), coords);
+            const auto meanOffset     = shape::getOffset(mean->getShapeInfo(), minCoords);
-            const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), coords);
+            const auto varianceOffset = paramSameOffset ? meanOffset : shape::getOffset(variance->getShapeInfo(), minCoords);
            T sigmaInvGam = 1. / sd::math::nd4j_sqrt<T, T>(v[varianceOffset] + epsilon);
            if(g != nullptr) {
-                const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), coords);
+                const auto gammaOffset = paramSameOffset ? meanOffset : shape::getOffset(gamma->getShapeInfo(), minCoords);
                sigmaInvGam *= g[gammaOffset];
            }
            z[zOffset] = (x[xOffset] - m[meanOffset]) * sigmaInvGam;
            if(b != nullptr) {
-                const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), coords);
+                const auto betaOffset = paramSameOffset ? meanOffset : shape::getOffset(beta->getShapeInfo(), minCoords);
                z[zOffset] += b[betaOffset];
            }
        }
@ -184,7 +186,7 @@ static void batchnorm2_(const NDArray* input, const NDArray* mean, const NDArray
 //////////////////////////////////////////////////////////////////////////
 void batchnorm(const NDArray* input, const NDArray* mean, const NDArray* variance, const NDArray* gamma, const NDArray* beta, NDArray* output, const std::vector<int>& axes, const double epsilon) {
-    // batchnorm2_ is slower
+    // batchnorm2_ is still slower ?
    BUILD_SINGLE_SELECTOR(input->dataType(), batchnorm_, (input, mean, variance, gamma, beta, output, axes, epsilon), FLOAT_TYPES);
 }
--- a/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/imagesHelpers.cpp
@ -51,9 +51,9 @@ static void rgbToGrs_(const NDArray& input, NDArray& output, const int dimC) {
    auto func = PRAGMA_THREADS_FOR{
-         Nd4jLong coords[MAX_RANK];
+        int coords[MAX_RANK];
        for (auto i = start; i < stop; i++) {
-             shape::index2coords(i, output.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
            const auto xOffset0 =  shape::getOffset(input.getShapeInfo(), coords);
            const auto xOffset1 = xOffset0 + input.strideAt(dimC);
--- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp
@ -49,9 +49,12 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp
    const auto xLen = input.lengthOf();
    auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
        int coords[MAX_RANK];
        for (Nd4jLong i = 0; i < xLen; ++i) {
-            shape::index2coords(i, xShapeInfo, coords);
+
            shape::index2coordsCPU(start, i, xShapeInfo, coords);
            const auto xOffset = shape::getOffset(xShapeInfo, coords);
            const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords);
--- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp
@ -113,18 +113,23 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray&
    // loop through input array
    auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+
        int zCoords[MAX_RANK], xCoords[MAX_RANK];
        for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, output.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
-            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+            memcpy(xCoords, zCoords, rank * sizeof(int));
            // evaluate spatial coordinates for x
            for (uint j = 1; j <= numOfSpatialDims; ++j)
-                coords[j] += crop.e<uint>(j - 1, 0);       // add crop left
+                xCoords[j] += crop.e<uint>(j - 1, 0);       // add crop left
-            z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
            z[zOffset] = x[xOffset];
        }
    };
@ -299,11 +304,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
    // loop through output array
    auto func = PRAGMA_THREADS_FOR {
        Nd4jLong coords[MAX_RANK];
        for (auto i = start; i < stop; i++) {
            shape::index2coords(i, output.getShapeInfo(), coords);
-            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+        int zCoords[MAX_RANK], xCoords[MAX_RANK];
        for (auto i = start; i < stop; i++) {
            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
            memcpy(xCoords, zCoords, rank * sizeof(int));
            bool within = true;
@ -312,16 +322,16 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra
                const auto padLeft = padding.e<uint>(j - 1, 0);
                const auto padRight = padding.e<uint>(j - 1, 1);
-                within &= (coords[j] >= padLeft && coords[j] < output.sizeAt(j) - padRight);
+                within &= zCoords[j] >= padLeft && zCoords[j] < output.sizeAt(j) - padRight;
                if (!within)
                    break;
-                coords[j] -= padLeft;       // get coordinates for x
+                xCoords[j] = zCoords[j] - padLeft;       // get coordinates for x
            }
            if (within)
-                z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+                z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
            else
                z[zOffset] = 0.f;
        }
--- a/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/scatter.cpp
@ -43,11 +43,11 @@ Nd4jLong checkIndices_(const NDArray& indices, const NDArray& output, const int
    auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong xCoords[MAX_RANK];
+        int xCoords[MAX_RANK];
        for (auto i = start; i < stop; i++) {
-            shape::index2coords(i, xShapeInfo, xCoords);
+            shape::index2coordsCPU(start, i, xShapeInfo, xCoords);
            const Nd4jLong currentInd = x[shape::getOffset(xShapeInfo, xCoords)];
--- a/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/split.cpp
@ -96,14 +96,17 @@ namespace helpers {
                auto func = PRAGMA_THREADS_FOR{
-                    Nd4jLong coords[MAX_RANK];
+                    int coords[MAX_RANK], temp;
                    for (auto i = start; i < stop; i += increment) {
-                        shape::index2coords(i, input.getShapeInfo(), coords);
+                        shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);
                        const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
                        uint outArrIdx = 0;
                        temp = coords[axis];
                        while (coords[axis] >= zDim) {
                            coords[axis] -= zDim;
                            ++outArrIdx;
@ -112,6 +115,8 @@ namespace helpers {
                        T* z = outArrs[outArrIdx]->bufferAsT<T>();
                        const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
                        z[zOffset] = xBuff[xOffset];
                        coords[axis] = temp;
                    }
                };
--- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
+++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp
@ -188,24 +188,35 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
        const T padVal = padValue.e<T>(0);
        auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong coords[MAX_RANK];
+
            int zCoords[MAX_RANK], xCoords[MAX_RANK];
            for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), coords);
+
-                const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+                shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
                const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
                memcpy(xCoords, zCoords, rank * sizeof(int));
                bool within = true;
                for (int j = rankMinusOne; j >= 0; --j) {
-                    if (xShape[j] == zShape[j]) continue;
+
                    if (xShape[j] == zShape[j])
                        continue;
                    const auto left = paddings.e<Nd4jLong>(j, 0);
-                    if (coords[j] < left || coords[j] >= left + xShape[j]) {
+
                    if (zCoords[j] < left || zCoords[j] >= left + xShape[j]) {
                        within = false;
                        break;
                    }
-                    else { coords[j] = coords[j] - left; }
+                    else
                        xCoords[j] = zCoords[j] - left;
                }
                if (within)
-                    z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords)];
+                    z[zOffset] = x[shape::getOffset(input.getShapeInfo(), xCoords)];
                else
                    z[zOffset] = padVal;
            }
@ -219,20 +230,30 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray
        const Nd4jLong shift2 = mode == 1 ? 2 : 1;         // REFLECT : SYMMETRIC
        auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong coords[MAX_RANK];
+
            int zCoords[MAX_RANK], xCoords[MAX_RANK];
            for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), coords);
+
-                const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+                shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
                const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
                memcpy(xCoords, zCoords, rank * sizeof(int));
                for (int j = rankMinusOne; j >= 0; --j) {
-                    if (xShape[j] == zShape[j]) continue;
+                    if (xShape[j] == zShape[j])
-                    coords[j] = coords[j] - paddings.e<Nd4jLong>(j, 0);                             // are ready to fill middle (within input dimension range)
+                        continue;
-                    if (coords[j] < 0) coords[j] = -coords[j] - shift1;                // means fill from left
+
-                    else if (coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right
+                    xCoords[j] = zCoords[j] - paddings.e<Nd4jLong>(j, 0);                             // are ready to fill middle (within input dimension range)
                    if (xCoords[j] < 0)
                        xCoords[j] = -xCoords[j] - shift1;                // means fill from left
                    else if (xCoords[j] >= xShape[j])
                        xCoords[j] = 2 * xShape[j] - xCoords[j] - shift2; // means fill from right
                }
-                const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
+                const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
                z[zOffset] = x[xOffset];
            }
        };
@ -562,45 +583,37 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) {
    const Nd4jLong zLen = output.lengthOf();
-    const int yLastDim = indices.sizeAt(-1);
+    const uint yLastDim = indices.sizeAt(-1);
    const int diff = zRank - xRank;
    const bool bEqual = yLastDim == xRank;
    auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK * 3];
+
        int xCoords[MAX_RANK], zCoords[MAX_RANK], temp;
        for (auto i = start; i < stop; i++) {
            Nd4jLong *zCoordStart, *xCoordStart;
-            if (yLastDim == xRank) {
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), zCoords);
                zCoordStart = coords;
                xCoordStart = coords;
            } else if (zRank >= xRank) {
                zCoordStart = coords;
                xCoordStart = coords + zRank - xRank;
            } else {
                zCoordStart = coords + xRank - zRank;
                xCoordStart = coords;
            }
-            shape::index2coords(i, output.getShapeInfo(), zCoordStart);
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoords);
-            const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart);
+            temp = zCoords[yRank - 1];
            zCoords[yRank - 1] = 0;
            const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoords);
            zCoords[yRank - 1] = temp;
-            // last y coordinate
+            if(bEqual)
-            uint coordToRestore;
+                memcpy(xCoords, zCoords, zRank * sizeof(int));
-            if (yLastDim != xRank)
+            else if(diff >= 0)
-                coordToRestore = static_cast<uint>(zCoordStart[yRank - 1]);
+                memcpy(xCoords, zCoords + diff, xRank * sizeof(int));
            else
                memcpy(xCoords - diff, zCoords, zRank * sizeof(int));
-            zCoordStart[yRank - 1] = 0;
+            for (uint j = 0; j < yLastDim; ++j)
-            const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart);
+                xCoords[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]];   // last stride
-            //restore z coordinate
+            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoords);
            if (yLastDim != xRank)
                zCoordStart[yRank - 1] = coordToRestore;
            // construct coordinates for x
            for (int j = 0; j < yLastDim; ++j)
                xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]];   // last stride
            const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart);
            z[zOffset] = x[xOffset];
        }
@ -1188,10 +1201,12 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o
    else {
        auto func = PRAGMA_THREADS_FOR {
-            Nd4jLong inIdx[MAX_RANK];
+
-            Nd4jLong outIdx[MAX_RANK];
+            int inIdx[MAX_RANK], outIdx[MAX_RANK];
            for (auto i = start; i < stop; i++) {
-                shape::index2coords(i, output.getShapeInfo(), outIdx);
+
                shape::index2coordsCPU(start, i, output.getShapeInfo(), outIdx);
                for (int j = 0; j < rank; ++j) {
                    const Nd4jLong inLen = input.sizeAt(j);
--- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu
@ -52,7 +52,7 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo,
 	__syncthreads();
 	const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-	Nd4jLong coords[MAX_RANK];
+	int coords[MAX_RANK];
 	for (int i = tid; i < xzLen; i += blockDim.x * gridDim.x) {
    	shape::index2coords(i, xShapeInfo, coords);
@ -124,7 +124,7 @@ __global__ linkage void preluBPCuda(const void *vIn,    const Nd4jLong *inShapeI
 	__syncthreads();
 	const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-	Nd4jLong coords[MAX_RANK];
+	int coords[MAX_RANK];
 	for (int i = tid; i < inLen; i += totalThreads) {
    	shape::index2coords(i, inShapeInfo, coords);
--- a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
@ -45,7 +45,7 @@ __global__ static void addBiasCuda( const void* vx, const Nd4jLong* xShapeInfo,
          X* z = reinterpret_cast<X*>(vz);
    __shared__ int rank, channelPosition, posOfNonUnityDim;
-    __shared__ Nd4jLong *sharedMem, len;
+    __shared__ Nd4jLong len, *sharedMem;
    __shared__ bool xzSameOffsets, xzAreSame;
    if (threadIdx.x == 0) {
@ -130,7 +130,7 @@ void addBias(sd::graph::Context& block, const NDArray& input, const NDArray& bia
                              FLOAT_TYPES, FLOAT_TYPES);
    } else {
        // default case
-        const int threadsPerBlock = MAX_NUM_THREADS / 2;
+        const int threadsPerBlock = MAX_NUM_THREADS / 4;
        const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
        const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
--- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu
@ -124,7 +124,7 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo
    }
    __syncthreads();
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@ -51,7 +51,7 @@ __global__ static void concatCuda(void* pVx,  void* pxShapeInfo, void* vz, Nd4jL
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];
    for (uint64_t i = tid; i < zLen; i += totalThreads) {
        shape::index2coords(i, zShapeInfo, coords);
--- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu
@ -706,7 +706,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo,
          T* z = reinterpret_cast<T*>(vz);
    __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
@ -858,7 +858,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf
    Nd4jLong coord2, coord3;
    __shared__ int rank, kHeff, kWeff, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, yLen;
+    __shared__ Nd4jLong yLen, *sharedMem;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
@ -1017,7 +1017,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf
    Nd4jLong coord2, coord3, coord4;
    __shared__ int rank, kDeff, kHeff, kWeff, iD, iH, iW, kProd;
-    __shared__ Nd4jLong *sharedMem, yLen;
+    __shared__ Nd4jLong yLen, *sharedMem;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
@ -1342,7 +1342,7 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn
          T* z = reinterpret_cast<T*>(vz);
    __shared__ int rank, dimIH;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
@ -1410,7 +1410,7 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn
          T* z = reinterpret_cast<T*>(vz);
    __shared__ int rank, dimID;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
@ -1480,7 +1480,7 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape
    __shared__ int rank, dimIH;
    __shared__ uint factorH, factorW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
@ -1554,7 +1554,7 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape
    __shared__ int rank, dimID;
    __shared__ uint factorD, factorH, factorW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen, *sharedMem;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
--- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu
@ -36,8 +36,8 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
    __shared__ const T* x;
    __shared__ const T* y;
    __shared__ 		 T* z;
-    __shared__ int rank;
+    __shared__ int rank, *sharedMem;
-    __shared__ Nd4jLong lenWithoutLastDim, totalThreads, *sharedMem;
+    __shared__ Nd4jLong lenWithoutLastDim, totalThreads;
    if (threadIdx.x == 0) {
    	x = reinterpret_cast<const T*>(vx);
@ -45,7 +45,7 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo,
    	z = reinterpret_cast<T*>(vz);
        extern __shared__ unsigned char shmem[];
-        sharedMem    = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem    = reinterpret_cast<int*>(shmem);
        totalThreads = gridDim.x * blockDim.x;
        rank              = shape::rank(xShapeInfo);
@ -106,7 +106,7 @@ void crossBatched(sd::LaunchContext* context, NDArray *x, NDArray *y, NDArray *z
 	const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (x->lengthOf() / x->sizeAt(-1) + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = sizeof(Nd4jLong) * threadsPerBlock * x->rankOf() + 128;
+    const int sharedMem = sizeof(int) * threadsPerBlock * x->rankOf() + 128;
    PointersManager manager(context, "cross");
--- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu
@ -43,13 +43,13 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo
    const X* y = reinterpret_cast<const X*>(vy);
          Z* z = reinterpret_cast<Z*>(vz);
-    __shared__ int xzRank, yRank;
+    __shared__ int xzRank, yRank, *sharedMem;
    __shared__ uint iH, iW, kH, kW;
-    __shared__ Nd4jLong *sharedMem, zLen;
+    __shared__ Nd4jLong zLen;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        zLen = shape::length(zShapeInfo);
@ -119,7 +119,7 @@ void dilation2d(sd::LaunchContext* context, NDArray *input, NDArray *weights, ND
    const int threadsPerBlock = MAX_NUM_THREADS / 2;
    const int blocksPerGrid = (output->lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(Nd4jLong) * threadsPerBlock  + 128;
+    const int sharedMem = (weights->rankOf() + output->rankOf()) * sizeof(int) * threadsPerBlock  + 128;
    NDArray::prepareSpecialUse({output}, {input, weights});
    BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), dilation2dCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input->getSpecialBuffer(), input->getSpecialShapeInfo(), weights->getSpecialBuffer(), weights->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), sH, sW, pH, pW, dH, dW), FLOAT_TYPES);
--- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu
@ -27,7 +27,7 @@ namespace sd {
            template <typename T>
            void _CUDA_G flattenKernel(void **xBuffers, Nd4jLong **xShapeInfos, Nd4jLong *offsets, Nd4jLong numInputs, void *zBuffer, Nd4jLong *zShapeInfo, char order) {
-                Nd4jLong xCoord[MAX_RANK];
+                int xCoord[MAX_RANK];
                // each block of threads works on 1 input array
                for (Nd4jLong e = blockIdx.x; e < numInputs; e += gridDim.x) {
--- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu
@ -40,12 +40,12 @@ __global__ static void im2colCuda(const void *image, void *columns,
    const auto im  = reinterpret_cast<const T*>(image);
          auto col = reinterpret_cast<T*>(columns);
-    __shared__ Nd4jLong colLen, *sharedMem, iH, iW;
+    __shared__ Nd4jLong colLen, iH, iW;
-    __shared__ int imRank, colRank;
+    __shared__ int imRank, colRank, *sharedMem;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        colRank = 6;
        imRank  = 4;
@ -81,7 +81,7 @@ __global__ static void im2colCuda(const void *image, void *columns,
 //////////////////////////////////////////////////////////////////////////
 template <typename T>
 static void im2colCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, sd::LaunchContext & context, const void *image, void *columns, const Nd4jLong *imShapeInfo, const Nd4jLong *colShapeInfo, int sH, int sW, int pH, int pW, int dH, int dW, double zeroPadVal) {
-    im2colCuda<T><<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(Nd4jLong) * 6 /* rank of columns = 6 */, *context.getCudaStream()>>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
+    im2colCuda<T><<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int) * 6 /* rank of columns = 6 */, *context.getCudaStream()>>>(image, columns, imShapeInfo, colShapeInfo, sH, sW, pH, pW, dH, dW, zeroPadVal);
 }
 //////////////////////////////////////////////////////////////////////////
--- a/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/imagesHelpers.cu
@ -149,19 +149,19 @@ __global__ void rgbToGrsCuda(const void *vx, const Nd4jLong *xShapeInfo, void *v
 	const auto x = reinterpret_cast<const T*>(vx);
 		  auto z = reinterpret_cast<T*>(vz);
-	__shared__ Nd4jLong zLen, *sharedMem;
+	__shared__ Nd4jLong zLen;
-	__shared__ int rank;	// xRank == zRank
+	__shared__ int rank, *sharedMem;	// xRank == zRank
 	if (threadIdx.x == 0) {
 		extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
 		zLen = shape::length(zShapeInfo);
 		rank = shape::rank(zShapeInfo);
 	}
 	__syncthreads();
-	Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+	auto coords = sharedMem + threadIdx.x * rank;
 	for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i +=  gridDim.x * blockDim.x) {
@ -197,7 +197,7 @@ void transformRgbGrs(sd::LaunchContext* context, const NDArray& input, NDArray&
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+    const int sharedMem = input.rankOf() * sizeof(int) * threadsPerBlock + 128;
 	NDArray::prepareSpecialUse({&output}, {&input});
 	BUILD_SINGLE_SELECTOR(input.dataType(), rgbToGrsCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.getSpecialBuffer(), output.getSpecialShapeInfo(), dimC), NUMERIC_TYPES);
--- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu
@ -39,14 +39,14 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI
    const auto y = reinterpret_cast<const T*>(vy);
          auto z = reinterpret_cast<T*>(vz);
-    __shared__ int xRank;       // xRank = zRank, xRank = yRank + 1
+    __shared__ int xRank, *sharedMem;       // xRank = zRank, xRank = yRank + 1
-    __shared__ Nd4jLong xLen, *sharedMem;   // xLen = zLen
+    __shared__ Nd4jLong xLen;   // xLen = zLen
    __shared__ bool areSameOffsets;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);    // shapes are definitely the same, but strides might not
@ -56,7 +56,7 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI
    __syncthreads();
-    auto coords = sharedMem + threadIdx.x * xRank;               // we provide (xRank * sizeof(Nd4jLong) * threadIdx.x) amount of shared memory per each thread
+    auto coords = sharedMem + threadIdx.x * xRank;               // we provide (xRank * sizeof(int) * threadIdx.x) amount of shared memory per each thread
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) {
@ -86,7 +86,7 @@ void matrixSetDiag(sd::LaunchContext* context, const NDArray& input, const NDArr
    const int threadsPerBlock = MAX_NUM_THREADS / 2;
    const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * input.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * input.rankOf() + 128;
    PointersManager manager(context, "matrixSetDiag");
--- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu
@ -43,12 +43,12 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn
    const auto x = reinterpret_cast<const T*>(vx);
          auto z = reinterpret_cast<T*>(vz);
-    __shared__ int rank;
+    __shared__ int rank, *sharedMem;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ Nd4jLong zLen;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        rank  = shape::rank(zShapeInfo);
        zLen  = shape::length(zShapeInfo);
@ -103,7 +103,7 @@ void batchToSpace(sd::LaunchContext* context, const NDArray& input, NDArray& out
        const int threadsPerBlock = MAX_NUM_THREADS / 2;
        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+        const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
        PointersManager manager(context, "batchToSpace");
@ -138,13 +138,13 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape
    const auto y = reinterpret_cast<const Y*>(vy);
          auto z = reinterpret_cast<X*>(vz);
-    __shared__ int rank;
+    __shared__ int rank, *sharedMem;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ Nd4jLong zLen;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        rank  = shape::rank(zShapeInfo);
        zLen  = shape::length(zShapeInfo);
@ -234,7 +234,7 @@ void batchToSpaceND(sd::LaunchContext* context, const NDArray& input, const NDAr
        const int threadsPerBlock = MAX_NUM_THREADS / 4;
        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+        const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
        PointersManager manager(context, "batchToSpaceND");
@ -264,12 +264,12 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn
    const auto x = reinterpret_cast<const T*>(vx);
          auto z = reinterpret_cast<T*>(vz);
-    __shared__ int rank;
+    __shared__ int rank, *sharedMem;
-    __shared__ Nd4jLong zLen, *sharedMem;
+    __shared__ Nd4jLong zLen;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        rank  = shape::rank(zShapeInfo);
        zLen  = shape::length(zShapeInfo);
@ -326,7 +326,7 @@ void spaceToBatch(sd::LaunchContext* context, const NDArray& input, NDArray& out
        const int threadsPerBlock = MAX_NUM_THREADS / 2;
        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+        const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
        PointersManager manager(context, "spaceToBatch");
@ -364,13 +364,13 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape
    const auto y = reinterpret_cast<const Y*>(vy);
          auto z = reinterpret_cast<X*>(vz);
-    __shared__ int rank;    // xRank = zRank, yRank = 2;
+    __shared__ int rank, *sharedMem;    // xRank = zRank, yRank = 2;
-    __shared__ Nd4jLong zLen, totalThreads, *sharedMem;
+    __shared__ Nd4jLong zLen, totalThreads;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        rank  = shape::rank(zShapeInfo);
        zLen  = shape::length(zShapeInfo);
@ -473,7 +473,7 @@ void spaceToBatchND(sd::LaunchContext* context, const NDArray& input, const NDAr
        const int threadsPerBlock = MAX_NUM_THREADS / 4;
        const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-        const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * output.rankOf() + 128;
+        const int sharedMem = threadsPerBlock * sizeof(int) * output.rankOf() + 128;
        PointersManager manager(context, "spaceToBatchND");
--- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu
@ -628,12 +628,12 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo,
          auto y = reinterpret_cast<Z*>(vy);
          auto z = reinterpret_cast<Z*>(vz);
-    __shared__ Nd4jLong xLen, *sharedMem;
+    __shared__ Nd4jLong xLen;
-    __shared__ int xRank;   // xRank = zRank, yRank = xRank + 1
+    __shared__ int xRank, *sharedMem;   // xRank = zRank, yRank = xRank + 1
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        xLen  = shape::length(xShapeInfo);
        xRank = shape::rank(xShapeInfo);
@ -678,7 +678,7 @@ void scatterForLoss(sd::LaunchContext* context, const NDArray& indices, NDArray&
    const int threadsPerBlock = MAX_NUM_THREADS / 2;
    const int blocksPerGrid = (indices.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = updates.rankOf() * sizeof(Nd4jLong) * threadsPerBlock  + 128;
+    const int sharedMem = updates.rankOf() * sizeof(int) * threadsPerBlock  + 128;
    if(calcGrad) {
        NDArray::prepareSpecialUse({&updates}, {&indices});
--- a/libnd4j/include/ops/declarable/helpers/cuda/split.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/split.cu
@ -54,7 +54,7 @@ __global__ static void splitCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong coords[MAX_RANK];
+    int coords[MAX_RANK];
    for (uint64_t i = tid; i < xLen; i += totalThreads) {
--- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu
@ -135,13 +135,13 @@ __global__ static void sruBICuda(const void* vx,    const Nd4jLong* xShapeInfo,
    const int rank = 3;
-    __shared__ int time, K;
+    __shared__ int time, K, *sharedMem;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;
+    __shared__ Nd4jLong len, totalThreads;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        time = xShapeInfo[1];
        K    = xShapeInfo[3] / 2;
@ -152,7 +152,7 @@ __global__ static void sruBICuda(const void* vx,    const Nd4jLong* xShapeInfo,
    __syncthreads();
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+    auto coords = sharedMem + threadIdx.x * rank;
    if(tid >= len)
        return;
@ -245,7 +245,7 @@ void sruBI(sd::LaunchContext * context, NDArray* x, const NDArray* w, const NDAr
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock;      // loop through last two dimensions of x array -> bS, 2*K
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
    NDArray::prepareSpecialUse({ht, ct}, {x, &wi, b, c0, mask});
    BUILD_SINGLE_SELECTOR(x->dataType(), sruBICudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ht->specialBuffer(), ht->specialShapeInfo(), ct->specialBuffer(), ct->specialShapeInfo()), FLOAT_TYPES);
@ -340,13 +340,13 @@ __global__ static void sruBIBPCuda(const void* vx,       const Nd4jLong* xShapeI
    const int rank = 3;
-    __shared__ int time, K;
+    __shared__ int time, K, *sharedMem;
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;
+    __shared__ Nd4jLong len, totalThreads;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        time = xShapeInfo[1];
        K    = xShapeInfo[3] / 2;
@ -358,7 +358,7 @@ __global__ static void sruBIBPCuda(const void* vx,       const Nd4jLong* xShapeI
    __syncthreads();
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
-    Nd4jLong* coords = sharedMem + threadIdx.x * rank;
+    auto coords = sharedMem + threadIdx.x * rank;
    if(tid >= len)
        return;
@ -513,7 +513,7 @@ void sruBIBP(sd::LaunchContext* context, NDArray* x, const NDArray* w, const NDA
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (x->sizeAt(1) * x->sizeAt(2) + threadsPerBlock - 1) / threadsPerBlock;      // loop through last two dimensions of x array -> bS, 2*K
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * x->rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * x->rankOf() + 128;
    NDArray::prepareSpecialUse({gradI, &gradWi, &gradBias, gradC0}, {x, &wi, b, c0, ct, gradCt, gradHt, mask});
    BUILD_SINGLE_SELECTOR(x->dataType(), sruBIBPCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), x->getSpecialBuffer(), x->getSpecialShapeInfo(), wi.getSpecialBuffer(), wi.getSpecialShapeInfo(), b->getSpecialBuffer(), b->getSpecialShapeInfo(), c0->getSpecialBuffer(), c0->getSpecialShapeInfo(), mask ? mask->getSpecialBuffer() : nullptr, mask ? mask->getSpecialShapeInfo() : nullptr, ct->getSpecialBuffer(), ct->getSpecialShapeInfo(), gradHt->getSpecialBuffer(), gradHt->getSpecialShapeInfo(), gradCt->getSpecialBuffer(), gradCt->getSpecialShapeInfo(), gradI->specialBuffer(), gradI->specialShapeInfo(), gradWi.specialBuffer(), gradWi.specialShapeInfo(), gradBias.specialBuffer(), gradBias.specialShapeInfo(), gradC0->specialBuffer(), gradC0->specialShapeInfo()), FLOAT_TYPES);
--- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu
@ -93,13 +93,13 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
          auto z = reinterpret_cast<T*>(vz);
    __shared__ T* sharedMem;
-    __shared__ int xRank, zRank;        // xRank = zRank + 2
+    __shared__ int xRank, zRank, *coordsMem;        // xRank = zRank + 2
-    __shared__ Nd4jLong xLen, zLen, *coordsMem;
+    __shared__ Nd4jLong xLen, zLen;
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
        sharedMem = reinterpret_cast<T*>(shmem);
-        coordsMem = reinterpret_cast<Nd4jLong*>(shmem + blockDim.x * sizeof(T));
+        coordsMem = reinterpret_cast<int*>(shmem + blockDim.x * sizeof(T));
        xRank = shape::rank(xShapeInfo);
        zRank = shape::rank(zShapeInfo);
@ -109,7 +109,7 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi
    }
    __syncthreads();
-    Nd4jLong* coords = coordsMem + threadIdx.x * xRank;
+    auto coords = coordsMem + threadIdx.x * xRank;
    for (uint m = blockIdx.x; m < zLen; m += gridDim.x) {   // one block per each element of z, that is per each matrix
@ -160,7 +160,7 @@ void trace(sd::LaunchContext* context, const NDArray& input, NDArray& output) {
    const uint diagLen = input.sizeAt(-1) < input.sizeAt(-2) ? input.sizeAt(-1) : input.sizeAt(-2);
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * (sizeof(Nd4jLong) * input.rankOf() + input.sizeOfT()) + 128;
+    const int sharedMem = threadsPerBlock * (sizeof(int) * input.rankOf() + input.sizeOfT()) + 128;
    NDArray::prepareSpecialUse({&output}, {&input});
    BUILD_SINGLE_SELECTOR(input.dataType(), traceCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), diagLen), LIBND4J_TYPES);
@ -177,13 +177,13 @@ __global__ static void triuBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
    const auto x = reinterpret_cast<const T*>(vx);  // gradO
          auto z = reinterpret_cast<T*>(vz);        // gradI
-    __shared__ int rank, areSameOffsets;                // xRank = zRank
+    __shared__ int rank, areSameOffsets, *sharedMem;                // xRank = zRank
-    __shared__ Nd4jLong len, totalThreads, *sharedMem;  // xLen = zLen
+    __shared__ Nd4jLong len, totalThreads;  // xLen = zLen
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        areSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
        rank = shape::rank(xShapeInfo);
        len  = shape::length(zShapeInfo);
@ -221,7 +221,7 @@ void triuBP(sd::LaunchContext* context, const NDArray& input, const NDArray& gra
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (gradO.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * gradO.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * gradO.rankOf() + 128;
    PointersManager manager(context, "triuBP");
@ -240,13 +240,13 @@ __global__ static void tileBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo
    const auto x = reinterpret_cast<const T*>(vx);  // gradO
          auto z = reinterpret_cast<T*>(vz);        // gradI
-    __shared__ int xRank, zRank;                // xRank >= zRank
+    __shared__ int xRank, zRank, *sharedMem;                // xRank >= zRank
-    __shared__ Nd4jLong numOfXOffsets, zLen, totalThreads, *sharedMem;  // xLen >= zLen
+    __shared__ Nd4jLong numOfXOffsets, zLen, totalThreads;  // xLen >= zLen
    if (threadIdx.x == 0) {
        extern __shared__ unsigned char shmem[];
-        sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+        sharedMem = reinterpret_cast<int*>(shmem);
        xRank = shape::rank(zShapeInfo);
        zLen  = shape::length(zShapeInfo);
@ -289,7 +289,7 @@ void tileBP(sd::LaunchContext * context, const NDArray& gradO /*input*/, NDArray
    const int threadsPerBlock = MAX_NUM_THREADS / 4;
    const int blocksPerGrid = (gradI.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
-    const int sharedMem = threadsPerBlock * sizeof(Nd4jLong) * 2 * gradO.rankOf() + 128;
+    const int sharedMem = threadsPerBlock * sizeof(int) * 2 * gradO.rankOf() + 128;
    PointersManager manager(context, "tileBP");
--- a/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp
+++ b/libnd4j/include/ops/declarable/helpers/impl/sparse_to_dense.cpp
@ -32,7 +32,7 @@ namespace sd {
                auto indices = reinterpret_cast<const I*>(vindices);
                auto output = reinterpret_cast<X*>(voutput);
-                Nd4jLong coords[MAX_RANK];
+                int coords[MAX_RANK];
                uint64_t pos = 0;
                for (uint64_t e = 0L; e < length; e++) {
                    // indices come in blocks
--- a/libnd4j/include/ops/declarable/helpers/impl/where.cpp
+++ b/libnd4j/include/ops/declarable/helpers/impl/where.cpp
@ -29,11 +29,14 @@ namespace sd {
                NDArrayList list(0, true);
                int cnt = 0;
-                Nd4jLong idx[MAX_RANK];
+                int idx[MAX_RANK];
                for (Nd4jLong e = 0; e < condition.lengthOf(); e++) {
-                    shape::index2coords(e, condition.getShapeInfo(), idx);
+
                    shape::index2coordsCPU(0, e, condition.getShapeInfo(), idx);
                    auto offset = shape::getOffset(condition.getShapeInfo(), idx);
                    if (condition.e<bool>(offset)) {
                        auto array = NDArrayFactory::create_('c', {1, condition.rankOf()}, output.dataType(), output.getContext());
                        for (int f = 0; f < condition.rankOf(); f++)
--- a/libnd4j/include/ops/impl/specials_single.hpp
+++ b/libnd4j/include/ops/impl/specials_single.hpp
@ -178,16 +178,18 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
    // general case
    auto func = PRAGMA_THREADS_FOR {
-        Nd4jLong coords[MAX_RANK];
+        int coords[MAX_RANK], temp;
        for (auto i = start; i < stop; i += increment) {
-            shape::index2coords(i, output.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, output.getShapeInfo(), coords);
            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
            uint inArrIdx = 0;
            uint xDim = inArrs[inArrIdx]->sizeAt(axis);
            temp = coords[axis];
            while (coords[axis] >= xDim) {
                coords[axis] -= xDim;
                xDim = inArrs[++inArrIdx]->sizeAt(axis);
@ -197,6 +199,8 @@ void SpecialMethods<T>::concatCpuGeneric(const std::vector<const NDArray*>& inAr
            const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords);
            zBuff[zOffset] = x[xOffset];
            coords[axis] = temp;
        }
    };
@ -298,13 +302,15 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
    auto func = PRAGMA_THREADS_FOR{
-        Nd4jLong coords[MAX_RANK];
+        int coords[MAX_RANK], temp;
        for (auto i = start; i < stop; i += increment) {
-            shape::index2coords(i, input.getShapeInfo(), coords);
+            shape::index2coordsCPU(start, i, input.getShapeInfo(), coords);
            const auto xOffset = shape::getOffset(input.getShapeInfo(), coords);
            uint outArrIdx = 0;
            temp = coords[axis];
            while (coords[axis] >= zDim) {
                coords[axis] -= zDim;
@ -314,6 +320,8 @@ void SpecialMethods<T>::splitCpuGeneric(const NDArray& input, const std::vector<
            T* z = outArrs[outArrIdx]->bufferAsT<T>();
            const auto zOffset = shape::getOffset(outArrs[outArrIdx]->getShapeInfo(), coords);
            z[zOffset] = xBuff[xOffset];
            coords[axis] = temp;
        }
    };
--- a/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/PlaygroundTests.cpp
@ -258,6 +258,7 @@ TEST_F(PlaygroundTests, test_bert_2) {
    delete graph;
 }
 TEST_F(PlaygroundTests, test_one_off_ops_1) {
    auto x = NDArrayFactory::create<float>('c', {4, 128, 768});
    auto y = NDArrayFactory::create<float>('c', {4, 128, 1});
--- a/libnd4j/tests_cpu/layers_tests/TadTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/TadTests.cpp
@ -289,7 +289,7 @@ TEST_F(TadTests, calcOffsets_1) {
 TEST_F(TadTests, outerArrayIndexes_1) {
    NDArray x('c', {2,3,4,5}, sd::DataType::FLOAT32);
-    Nd4jLong maxIdxs[120];
+    int maxIdxs[120];
    NDArray y1('c', {3,5}, sd::DataType::FLOAT32);
    const std::vector<int> dimsToExclude1 = {0,2};