diff --git a/libnd4j/blas/NDArray.h b/libnd4j/blas/NDArray.h index 3a57fc92b..21eedc665 100644 --- a/libnd4j/blas/NDArray.h +++ b/libnd4j/blas/NDArray.h @@ -1770,7 +1770,7 @@ NDArray NDArray::operator()(const Nd4jLong i) const { } else { Nd4jLong idx[MAX_RANK]; shape::ind2subC(rankOf(), shapeOf(), i, idx); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), idx); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1801,7 +1801,7 @@ NDArray& NDArray::operator()(const Nd4jLong i) { } else { Nd4jLong idx[MAX_RANK]; shape::ind2subC(rankOf(), shapeOf(), i, idx); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), idx); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1818,7 +1818,7 @@ NDArray NDArray::operator()(const Nd4jLong i, const Nd4jLong j) const { throw std::invalid_argument("NDArray::operator(i,j): one of input indexes is out of array length or rank!=2 !"); Nd4jLong coords[2] = {i, j}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); // TODO: do we really want a view here? auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); @@ -1834,7 +1834,7 @@ NDArray& NDArray::operator()(const Nd4jLong i, const Nd4jLong j) { throw std::invalid_argument("NDArray::operator(i,j): one of input indexes is out of array length or rank!=2 !"); Nd4jLong coords[2] = {i, j}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1853,7 +1853,7 @@ NDArray NDArray::operator()(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k throw std::invalid_argument("NDArray::operator(i,j,k): one of input indexes is out of array length or rank!=3 !"); Nd4jLong coords[3] = {i, j, k}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1870,7 +1870,7 @@ NDArray& NDArray::operator()(const Nd4jLong i, const Nd4jLong j, const Nd4jLong throw std::invalid_argument("NDArray::operator(i,j,k): one of input indexes is out of array length or rank!=3 !"); Nd4jLong coords[3] = {i, j, k}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1886,7 +1886,7 @@ NDArray NDArray::operator()(const Nd4jLong t, const Nd4jLong u, const Nd4jLong v throw std::invalid_argument("NDArray::operator(t,u,v,w): one of input indexes is out of array length or rank!=4 !"); Nd4jLong coords[4] = {t, u, v, w}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, 
rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1900,7 +1900,7 @@ NDArray& NDArray::operator()(const Nd4jLong t, const Nd4jLong u, const Nd4jLong throw std::invalid_argument("NDArray::operator(t,u,v,w): one of input indexes is out of array length or rank!=4 !"); Nd4jLong coords[4] = {t, u, v, w}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); // FIXME auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); @@ -1916,7 +1916,7 @@ NDArray NDArray::operator()(const Nd4jLong* idx) const { if (idx[i] >= sizeAt(i)) throw std::invalid_argument("NDArray::operator(const Nd4jLong* idx): input index is out of dimension length !"); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), idx); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -1931,7 +1931,7 @@ NDArray& NDArray::operator()(const Nd4jLong* idx) { if (idx[i] >= sizeAt(i)) throw std::invalid_argument("NDArray::operator(const Nd4jLong* idx): input index is out of dimension length !"); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), idx, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), idx); auto cast = reinterpret_cast(_buffer) + (xOffset * this->sizeOfT()); NDArray result(cast, nd4j::ShapeBuilders::createScalarShapeInfo(this->dataType(), this->getWorkspace())); @@ -2067,7 +2067,7 @@ T& NDArray::t(const Nd4jLong i, const Nd4jLong j) { syncToHost(); Nd4jLong coords[2] = {i, j}; - auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto offset = shape::getOffset(getShapeInfo(), coords); tickWriteHost(); return *(reinterpret_cast(bufferWithOffset(offset))); } @@ -2084,7 +2084,7 @@ T& NDArray::t(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k) { syncToHost(); Nd4jLong coords[3] = {i, j, k}; - auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto offset = shape::getOffset(getShapeInfo(), coords); tickWriteHost(); return *(reinterpret_cast(bufferWithOffset(offset))); } @@ -2118,7 +2118,7 @@ T NDArray::t(const Nd4jLong i, const Nd4jLong j) const { syncToHost(); Nd4jLong coords[2] = {i, j}; - auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto offset = shape::getOffset(getShapeInfo(), coords); tickReadHost(); return *(reinterpret_cast(bufferWithOffset(offset))); } @@ -2135,7 +2135,7 @@ T NDArray::t(const Nd4jLong i, const Nd4jLong j) const { syncToHost(); Nd4jLong coords[3] = {i, j, k}; - auto offset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto offset = shape::getOffset(getShapeInfo(), coords); tickReadHost(); return *(reinterpret_cast(bufferWithOffset(offset))); } diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 82427f9b9..0f0621a80 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -808,7 +808,7 @@ void NDArray::templatedSet(void *buffer, const Nd4jLong *indices, const void *va auto t = reinterpret_cast(buffer); const auto y = *(reinterpret_cast(value)); - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), indices, rankOf()); + auto xOffset = 
shape::getOffset(getShapeInfo(), indices); t[xOffset] = static_cast(y); } BUILD_DOUBLE_TEMPLATE(template void NDArray::templatedSet, (void *buffer, const Nd4jLong *indices, const void *value), LIBND4J_TYPES, LIBND4J_TYPES); @@ -2462,14 +2462,13 @@ double NDArray::getTrace() const { int rank = rankOf(); auto shape = shapeOf(); - auto strides = stridesOf(); int minDim = 100000000; Nd4jLong indices[MAX_RANK]; for(int j = 0; j < rank; ++j) indices[j] = 1; - auto offset = shape::getOffset(0, shape, strides, indices, rank); + auto offset = shape::getOffset(getShapeInfo(), indices); for(int i = 0; i < rank; ++i) if(minDim > shape[i]) @@ -3472,7 +3471,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j) const { throw std::invalid_argument("NDArray::e(i,j): one of input indexes is out of array length or rank!=2 !"); const Nd4jLong coords[2] = {i, j}; - const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + const auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({}, {this}); NDArray::registerPrimaryUse({}, {this}); @@ -3492,7 +3491,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k) const { throw std::invalid_argument("NDArray::e(i,j,k): one of input indexes is out of array length or rank!=3 !"); const Nd4jLong coords[3] = {i, j, k}; - const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + const auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({}, {this}); NDArray::registerPrimaryUse({}, {this}); @@ -3512,7 +3511,7 @@ T NDArray::e(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4jLon throw std::invalid_argument("NDArray::e(i,j,k,l): one of input indexes is out of array length or rank!=4 !"); const Nd4jLong coords[4] = {i, j, k, l}; - const auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + const auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({}, {this}); NDArray::registerPrimaryUse({}, {this}); @@ -4095,7 +4094,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const T value) { void *p = reinterpret_cast(const_cast(&value)); Nd4jLong coords[2] = {i, j}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({this}, {}, true); BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES); @@ -4127,7 +4126,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const T va void *p = reinterpret_cast(const_cast(&value)); Nd4jLong coords[3] = {i, j, k}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES); NDArray::registerPrimaryUse({this}, {}); } @@ -4154,7 +4153,7 @@ void NDArray::p(const Nd4jLong i, const Nd4jLong j, const Nd4jLong k, const Nd4j void *p = reinterpret_cast(const_cast(&value)); Nd4jLong coords[4] = {i, j, k, l}; - auto xOffset = shape::getOffset(0, shapeOf(), stridesOf(), coords, rankOf()); + auto xOffset = shape::getOffset(getShapeInfo(), coords); NDArray::preparePrimaryUse({this}, {}, true); BUILD_SINGLE_PARTIAL_SELECTOR(dataType(), templatedSet<, T>(this->getBuffer(), xOffset, p), LIBND4J_TYPES); @@ -4409,7 +4408,7 @@ Nd4jLong NDArray::getOffset(const Nd4jLong i) const { if (i >= 
lengthOf()) throw std::invalid_argument("NDArray::getOffset: input index is out of array length !"); - return shape::getIndexOffset(i, _shapeInfo, lengthOf()); + return shape::getIndexOffset(i, _shapeInfo); } NDArray NDArray::like() { @@ -4455,7 +4454,7 @@ NDArray* NDArray::diagonal(const char type) const { indices[i] = 1; } - auto step = shape::getOffset(0, shapeOf(), stridesOf(), indices, rank); + auto step = shape::getOffset(getShapeInfo(), indices); if(type == 'c') { outShapeInfo[1] = diagSize; diff --git a/libnd4j/blas/cpu/NDArray.cpp b/libnd4j/blas/cpu/NDArray.cpp index 24ef100d3..03c7c53e1 100644 --- a/libnd4j/blas/cpu/NDArray.cpp +++ b/libnd4j/blas/cpu/NDArray.cpp @@ -103,8 +103,8 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char PRAGMA_OMP_PARALLEL_FOR_ARGS(OMP_IF(zLen > Environment::getInstance()->elementwiseThreshold()) firstprivate(coords)) for (Nd4jLong i = 0; i < zLen; ++i) { - shape::index2coords(zRank, target->shapeOf(), i, zLen, coords.data()); - const auto zOffset = shape::getOffset(0, target->shapeOf(), target->stridesOf(), coords.data(), zRank); + shape::index2coords(i, target->getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(target->getShapeInfo(), coords.data()); // if( (row + upper < col) || (row + lower > col) ) if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) @@ -112,7 +112,7 @@ void NDArray::fillAsTriangular(const float val, int lower, int upper, const char else if(this != target) { // when this and target are different arrays if(xRank != zRank) coords[0] = coords[1]; - const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(0, shapeOf(), stridesOf(), coords.data(), xRank); + const auto xOffset = areSameOffsets ? 
zOffset : shape::getOffset(getShapeInfo(), coords.data()); z[zOffset] = x[xOffset]; } } @@ -128,13 +128,12 @@ void NDArray::setIdentity() { int rank = rankOf(); auto shape = shapeOf(); - auto strides = stridesOf(); int minDim = MAX_INT; Nd4jLong indices[MAX_RANK]; for(int j = 0; j < rank; ++j) indices[j] = 1; - Nd4jLong offset = shape::getOffset(0, shape, strides, indices, rank); + Nd4jLong offset = shape::getOffset(getShapeInfo(), indices); for(int i = 0; i < rank; ++i) if(minDim > shape[i]) @@ -380,9 +379,9 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector 1) { for (uint j = 0; j < repSize; ++j) { @@ -396,7 +395,7 @@ static void repeat_(const NDArray& input, NDArray& output, const std::vector(hX[offset + oldOffset], hX[offset + newOffset]); } } diff --git a/libnd4j/blas/cuda/NDArray.cu b/libnd4j/blas/cuda/NDArray.cu index f6a05c44b..1d95fd3c2 100644 --- a/libnd4j/blas/cuda/NDArray.cu +++ b/libnd4j/blas/cuda/NDArray.cu @@ -106,8 +106,8 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(zRank, shape::shapeOf(const_cast(zShapeInfo)), i, zLen, coords); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast(zShapeInfo)), shape::stride(const_cast(zShapeInfo)), coords, zRank); + shape::index2coords(i, zShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); // if( (row + upper < col) || (row + lower > col) ) if((coords[zRank - 2] + upper < coords[zRank - 1]) || (coords[zRank - 2] + lower > coords[zRank - 1])) @@ -115,7 +115,7 @@ __global__ static void fillAsTriangularCuda(const void* vx, const Nd4jLong* xSha else if(vx != vz) { // when x and z are different arrays if(xRank != zRank) coords[0] = coords[1]; - const auto xOffset = areSameOffsets ? zOffset : shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coords, xRank); + const auto xOffset = areSameOffsets ? 
zOffset : shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } } @@ -177,8 +177,8 @@ __global__ static void identityMatrixCuda(void* vx, const Nd4jLong* xShapeInfo, for (Nd4jLong i = tid; i < len; i += totalThreads) { - shape::index2coords(rank, shape::shapeOf(const_cast(xShapeInfo)), i, len, coords); - const auto offset = shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coords, rank); + shape::index2coords(i, xShapeInfo, coords); + const auto offset = shape::getOffset(xShapeInfo, coords); if(coords[rank - 2] == coords[rank - 1]) // row == col -> on diagonal x[offset] = val; @@ -424,9 +424,9 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo, for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); if(repSize > 1) { for (uint j = 0; j < repSize; ++j) { @@ -440,7 +440,7 @@ __global__ static void repeatCuda(const void* vx, const Nd4jLong* xShapeInfo, else coords[axis] /= repeats[0]; - z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] = x[shape::getOffset(xShapeInfo, coords)]; } } diff --git a/libnd4j/blas/cuda/NDArrayLambda.hpp b/libnd4j/blas/cuda/NDArrayLambda.hpp index bf9848981..c27476bfb 100644 --- a/libnd4j/blas/cuda/NDArrayLambda.hpp +++ b/libnd4j/blas/cuda/NDArrayLambda.hpp @@ -23,8 +23,8 @@ #include #include -static Nd4jLong __device__ __noinline__ __getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) { - return shape::getIndexOffset(index, shapeInfo, length); +static Nd4jLong __device__ __noinline__ __getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) { + return shape::getIndexOffset(index, shapeInfo); } static Nd4jLong __device__ __noinline__ __length(Nd4jLong *shapeInfo) { @@ -103,8 +103,8 @@ static _CUDA_G void lambdaKernel(void* vx, Nd4jLong *xShapeInfo, void *vz, Nd4jL z[e * zEws] = lambda(x[e * xEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(x[xOffset]); } @@ -132,8 +132,8 @@ static _CUDA_G void lambdaIndexedKernel(void* vx, Nd4jLong *xShapeInfo, void *vz z[e * zEws] = lambda(e, x[e * xEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(e, x[xOffset]); } @@ -164,9 +164,9 @@ static _CUDA_G void lambdaIndexedPairwiseKernel(void* vx, Nd4jLong *xShapeInfo, z[e * zEws] = lambda(e, x[e * xEws], y[e * yEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto yOffset = __getIndexOffset(e, yShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto yOffset = __getIndexOffset(e, yShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(e, x[xOffset], 
y[yOffset]); } @@ -197,9 +197,9 @@ static _CUDA_G void lambdaPairwiseKernel(void* vx, Nd4jLong *xShapeInfo, void* v z[e * zEws] = lambda(x[e * xEws], y[e * yEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto yOffset = __getIndexOffset(e, yShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto yOffset = __getIndexOffset(e, yShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(x[xOffset], y[yOffset]); } @@ -233,10 +233,10 @@ static _CUDA_G void lambdaTriplewiseKernel(void* vw, Nd4jLong *wShapeInfo, void* z[e * zEws] = lambda(w[e * wEws], x[e * xEws], y[e * yEws]); } else { for (uint e = tid; e < zLength; e += blockDim.x * gridDim.x) { - auto wOffset = __getIndexOffset(e, wShapeInfo, zLength); - auto xOffset = __getIndexOffset(e, xShapeInfo, zLength); - auto yOffset = __getIndexOffset(e, yShapeInfo, zLength); - auto zOffset = __getIndexOffset(e, zShapeInfo, zLength); + auto wOffset = __getIndexOffset(e, wShapeInfo); + auto xOffset = __getIndexOffset(e, xShapeInfo); + auto yOffset = __getIndexOffset(e, yShapeInfo); + auto zOffset = __getIndexOffset(e, zShapeInfo); z[zOffset] = lambda(w[wOffset], x[xOffset], y[yOffset]); } diff --git a/libnd4j/blas/cuda/NativeOps.cu b/libnd4j/blas/cuda/NativeOps.cu index ec88de2e5..6afabfca6 100755 --- a/libnd4j/blas/cuda/NativeOps.cu +++ b/libnd4j/blas/cuda/NativeOps.cu @@ -3228,8 +3228,8 @@ __global__ static void scatterUpdateCuda(const int opCode, const int numOfSubArr for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); - const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + const auto yOffset = shape::getIndexOffset(i, yShapeInfo); switch (opCode) { case 0: diff --git a/libnd4j/include/helpers/Loops.h b/libnd4j/include/helpers/Loops.h index d04d3315d..392ed3edf 100644 --- a/libnd4j/include/helpers/Loops.h +++ b/libnd4j/include/helpers/Loops.h @@ -246,9 +246,9 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto lenPerThread = static_cast(threadsInfo.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, len, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = op(x[xOffset], y[yOffset], extraParams); } } @@ -452,7 +452,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint j = 0; j < tadLen; j++) start = OpType::update(start, OpType::op(tad[j * tadEws], extraParams), extraParams); - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); z[zOffset] = OpType::postProcess(start, tadLen, extraParams); } } @@ -469,7 +469,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = 
OpType::startingValue(tad); for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad); + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); } @@ -491,11 +491,11 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, // auto start = OpType::startingValue(tad); // for (uint j = 0; j < tadLen; j++) { - // auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad); + // auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); // start = OpType::update(start, OpType::op(tad[tadOffset], extraParams), extraParams); // } - // auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + // auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); // z[zOffset] = OpType::postProcess(start, tadLen, extraParams); // } // } @@ -517,7 +517,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, for (uint j = 0; j < tadLen; j++) start = OpType::update(start, OpType::op(tad[innertadOffsets[j]], extraParams), extraParams); - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); z[zOffset] = OpType::postProcess(start, tadLen, extraParams); } @@ -658,13 +658,13 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, PRAGMA_OMP_SIMD for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, len, canCastX); + const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); zi[i * zEws] = OpType::op(x[xOffset], extraParams); } } else { PRAGMA_OMP_SIMD for (uint i = 0; i < lenPerThread; i++) { - const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, len, canCastX); + const auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, castXShapeInfo, canCastX); zi[i] = OpType::op(x[xOffset], extraParams); } } @@ -782,8 +782,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, PRAGMA_OMP_SIMD for (uint i = 0; i < lenPerThread; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], extraParams); } } @@ -1123,7 +1123,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { - const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad); + const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } @@ -1147,8 +1147,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = OpType::startingValue(xTad); for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad); - const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, 
tadLen, canCastYTad); + const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } @@ -1423,7 +1423,7 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = startVal; for (uint j = 0; j < tadLen; ++j) { - const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad); + const auto tadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); start = OpType::update(start, OpType::op(xTad[tadOffset], yTad[tadOffset], extraParams), extraParams); } z[zInd * zEws] = OpType::postProcess(start, tadLen, extraParams); @@ -1449,8 +1449,8 @@ void Loops::loopXYZ(const X* x, const Nd4jLong* xShapeInfo, auto start = startVal; for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, tadLen, canCastXTad); - const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, tadLen, canCastYTad); + const auto xTadOffset = shape::indexOffset(j, xTadShapeInfo, castXTadShapeInfo, canCastXTad); + const auto yTadOffset = shape::indexOffset(j, yTadShapeInfo, castYTadShapeInfo, canCastYTad); start = OpType::update(start, OpType::op(xTad[xTadOffset], yTad[yTadOffset], extraParams), extraParams); } diff --git a/libnd4j/include/helpers/ShapeUtils.h b/libnd4j/include/helpers/ShapeUtils.h index 1d991e36a..ba0f956a5 100644 --- a/libnd4j/include/helpers/ShapeUtils.h +++ b/libnd4j/include/helpers/ShapeUtils.h @@ -15,7 +15,7 @@ ******************************************************************************/ // -// @author iuriish@yahoo.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #ifndef LIBND4J_SHAPEUTILS_H diff --git a/libnd4j/include/helpers/TAD.h b/libnd4j/include/helpers/TAD.h index c49f1047d..9888bb1fd 100644 --- a/libnd4j/include/helpers/TAD.h +++ b/libnd4j/include/helpers/TAD.h @@ -526,7 +526,7 @@ namespace shape { /* int *sub = new int[leftOverIndexLen]; shape::ind2subOrder(tadShape,index,len,sub); */ - shape::index2coords(leftOverIndexLen,tadShape, index,len, sub); + shape::index2coords(index, leftOverIndexLen,tadShape, sub); for(int i = 0; i < leftOverIndexLen; i++) { @@ -609,7 +609,7 @@ namespace shape { if(dimensionLength > 1) { Nd4jLong *tad2Sub = this->tad2Sub(index, ptrManager); - Nd4jLong ret = shape::getOffset(0,shape::shapeOf(shapeInfo),shape::stride(shapeInfo),tad2Sub,shape::rank(shapeInfo)); + Nd4jLong ret = shape::getOffset(shapeInfo, tad2Sub); if(ret < 0) { if (ptrManager == nullptr) @@ -625,7 +625,7 @@ namespace shape { else { Nd4jLong *tad2Sub = this->tad2Sub(index, ptrManager); - Nd4jLong ret = shape::getOffset(0,shape::shapeOf(shapeInfo),shape::stride(shapeInfo),tad2Sub,shape::rank(shapeInfo)); + Nd4jLong ret = shape::getOffset(shapeInfo, tad2Sub); if (ptrManager == nullptr) delete[] tad2Sub; @@ -703,7 +703,7 @@ namespace shape { /* int *sub = new int[leftOverIndexLen]; shape::ind2subOrder(tadShape,index,len,sub); */ - shape::index2coords(leftOverIndexLen,tadShape,index,len, sub); + shape::index2coords(index, leftOverIndexLen,tadShape, sub); for(int i = 0; i < leftOverIndexLen; i++) { ret[leftOverIndexes[i]] = sub[i]; @@ -732,7 +732,7 @@ namespace shape { // return shape::createScalarShapeInfo(); //ensure tad shapes get setup right for vectors - if(dimensionLength > 1 && shape::isVector(shapeInfo)) + 
if(dimensionLength > 1 && shape::isVector(shapeInfo)) return shape::copyOf(shape::shapeInfoLength(shape::rank(shapeInfo)),shapeInfo); // case when tad coincides with whole array diff --git a/libnd4j/include/helpers/benchmark/ParametersBatch.h b/libnd4j/include/helpers/benchmark/ParametersBatch.h index 5a45099c3..4a7119937 100644 --- a/libnd4j/include/helpers/benchmark/ParametersBatch.h +++ b/libnd4j/include/helpers/benchmark/ParametersBatch.h @@ -64,7 +64,7 @@ namespace nd4j { for (int i = 0; i < totalIterations; i++) { - shape::index2coords(xRank, xShape, i, totalIterations, xCoords); + shape::index2coords(i, xRank, xShape, xCoords); Parameters params; for (int j = 0; j < xRank; j++) { diff --git a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp index 0a096b65f..22ff3e6b1 100644 --- a/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp +++ b/libnd4j/include/helpers/cpu/loops/IndexReductionLoops.cpp @@ -226,7 +226,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, indexValue = OpType::update(indexValue, comp, extraParams); } - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); z[zOffset] = (Z) indexValue.index; } } @@ -243,7 +243,7 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, auto indexValue = OpType::startingIndexValue(tad); for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad); + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); functions::indexreduce::IndexValue comp(tad[tadOffset], j); indexValue = OpType::update(indexValue, comp, extraParams); } @@ -266,12 +266,12 @@ void nd4j::IndexReductionLoops::loopIndexReduce(X* x, Nd4jLong* xShapeInfo, auto indexValue = OpType::startingIndexValue(tad); for (uint j = 0; j < tadLen; j++) { - auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, tadLen, canCastTad); + auto tadOffset = shape::indexOffset(j, tadShapeInfo, castTadShapeInfo, canCastTad); functions::indexreduce::IndexValue comp(tad[tadOffset], j); indexValue = OpType::update(indexValue, comp, extraParams); } - auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, zLen, canCastZ); + auto zOffset = shape::indexOffset(i, zShapeInfo, castZShapeInfo, canCastZ); z[zOffset] = (Z) indexValue.index; } } diff --git a/libnd4j/include/helpers/impl/ShapeUtils.cpp b/libnd4j/include/helpers/impl/ShapeUtils.cpp index 6d93351c3..91ee09123 100644 --- a/libnd4j/include/helpers/impl/ShapeUtils.cpp +++ b/libnd4j/include/helpers/impl/ShapeUtils.cpp @@ -15,7 +15,7 @@ ******************************************************************************/ // -// @author Yurii Shyrma +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -931,7 +931,7 @@ void ShapeUtils::evalIdxRangesForSubArr(const Nd4jLong subArrIdx, const Nd4jLon for(int i = 0; i < subArrRank; ++i) shapeOfSubArr[i] = shapeInfo[dimsToExclude[i] + 1]; - shape::index2coords(subArrRank, shapeOfSubArr.data(), subArrIdx, indexes.data()); + shape::index2coords(subArrIdx, subArrRank, shapeOfSubArr.data(), indexes.data()); memset(idxRanges, 0, 2 * rank * sizeof(Nd4jLong)); diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index 705f06b99..cae5f0fa9 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -887,7 +887,7 
@@ namespace shape { * @param indices the indices to iterate over * @return the double at the specified index */ - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(Nd4jLong baseOffset, const Nd4jLong *shape, const Nd4jLong *stride, const Nd4jLong *indices, const int rank); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0); ND4J_EXPORT Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector& indices); @@ -897,20 +897,19 @@ namespace shape { /** * Convert a linear index to the corresponding coordinates - * for example if shape is {2, 4}, then index 5 corresponds to following coordinates - * -> [1, 1] in case of c order - * -> [1, 2] in case of f order + * for example if shape is {2, 4}, then index 5 corresponds to coordinates [1, 1] */ - ND4J_EXPORT _CUDA_HD void index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong arrLen, Nd4jLong *coords, const char order = 'c'); - ND4J_EXPORT _CUDA_HD void index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong *coords, const char order = 'c'); + ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords); + ND4J_EXPORT _CUDA_HD void index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords); + + /** * Convert coordinates to the corresponding linear index (sequence number in other words) - * for example if shape is {2, 4}, then: - * in case of c order and coordinates [1, 1] index 5 is returned - * in case of f order and coordinates [1, 2] index 5 is returned + * for example if shape is {2, 4} and coordinates [1, 1] then index 5 is returned */ - ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords, const char order = 'c'); + ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *coords); + ND4J_EXPORT _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *coords); /** * increment n-dimensional array by one iteration by changing coord appropriately @@ -921,24 +920,10 @@ namespace shape { */ /* calculates an array buffer offset for given "index" using following formula: offset = coord_0*stride_0 + coord_1*stride_1 + ... + coord_{rank-1}*stride_{rank-1} - * arrLen - array length */ - ND4J_EXPORT _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen); - ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen); - ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOrderOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen, const char order); - ND4J_EXPORT _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, Nd4jLong arrLen, const bool useUnsigned); - - /** - * Compute the real linear indices for the given shape and stride - */ - ND4J_EXPORT _CUDA_HD Nd4jLong *computeIndices(int rank, Nd4jLong *shape, Nd4jLong *stride); - - /** - * Compute the real linear indices for the - * given shape buffer. 
Shape,stride and rank are derived - * from the buffer - */ - ND4J_EXPORT _CUDA_HD Nd4jLong *computeIndices( Nd4jLong *shapeBuffer); + ND4J_EXPORT _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo); + ND4J_EXPORT _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo); + ND4J_EXPORT _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, const bool useUnsigned); ND4J_EXPORT _CUDA_HD void printShapeInfo(Nd4jLong *shapeInfo); @@ -1749,57 +1734,34 @@ __device__ INLINEDEF Nd4jLong *cuMalloc(Nd4jLong *buffer, long size) { return output; } -/** - * Compute the real linear indices for the given shape and stride - */ - INLINEDEF _CUDA_HD Nd4jLong *computeIndices(int rank, Nd4jLong *shape, Nd4jLong *stride) { - Nd4jLong length = shape::prodLong(shape,rank); +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD Nd4jLong coords2index(const Nd4jLong *shapeInfo, const Nd4jLong *indices) { - traceNew(13); + Nd4jLong index, shift = 1;; - Nd4jLong *ret = new Nd4jLong[length]; - for(int i = 0; i < length; i++) { - Nd4jLong *idx = new Nd4jLong[rank]; - shape::index2coords(rank, shape, i, idx, 'f'); - ret[i] = shape::getOffset(0, shape, stride, idx, rank); - delete[] idx; - } - - return ret; - } - -/** -* Compute the real linear indices for the given shape and stride -*/ - INLINEDEF _CUDA_HD Nd4jLong *computeIndices(Nd4jLong *shapeBuffer) { - return computeIndices(shape::rank(shapeBuffer),shape::shapeOf(shapeBuffer),shape::stride(shapeBuffer)); + index = indices[shapeInfo[0] - 1]; + for(uint i = shapeInfo[0]; i > 1; --i) { + shift *= shapeInfo[i]; + index += shift * indices[i - 2]; } + return index; +} ////////////////////////////////////////////////////////////////////// - INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices, const char order) { +INLINEDEF _CUDA_HD Nd4jLong coords2index(const int rank, const Nd4jLong *shape, const Nd4jLong *indices) { - Nd4jLong index, shift = 1;; + Nd4jLong index, shift = 1;; - if(order == 'c') { - - index = indices[rank - 1]; - for(int i = rank - 2; i >= 0; --i) { - shift *= shape[i + 1]; - index += shift * indices[i]; - } - } - else { - index = indices[0]; - for(int i = 1; i < rank; ++i) { - shift *= shape[i - 1]; - index += shift * indices[i]; - } - } - - return index; + index = indices[rank - 1]; + for(uint i = rank - 1; i >= 1; --i) { + shift *= shape[i]; + index += shift * indices[i - 1]; } + return index; +} + template INLINEDEF _CUDA_HD void fill(T* buffer, T value, Nd4jLong length) { @@ -1809,85 +1771,110 @@ template } -////////////////////////////////////////////////////////////////////// - INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen) { +// ////////////////////////////////////////////////////////////////////// +// INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen) { - const Nd4jLong ews = shapeInfo[shapeInfo[0] + shapeInfo[0] + 2]; +// const Nd4jLong ews = shapeInfo[shapeInfo[0] + shapeInfo[0] + 2]; - if(ews > 0 && order(shapeInfo) == 'c') - if (ews == 1) - return index; - else - return ews * index; +// if(ews > 0 && order(shapeInfo) == 'c') +// if (ews == 1) +// return index; +// else +// return ews * index; - Nd4jLong offset = 0; - Nd4jLong rank = shapeInfo[0]; - for(int i = 1; i <= shapeInfo[0]; ++i) { - arrLen /= shapeInfo[i]; - if(arrLen > 0 && shapeInfo[i] > 1) { - offset += (index / 
arrLen) * shapeInfo[i + rank]; - index %= arrLen; - } - } - return offset; - } +// Nd4jLong offset = 0; +// Nd4jLong rank = shapeInfo[0]; +// for(int i = 1; i <= shapeInfo[0]; ++i) { +// arrLen /= shapeInfo[i]; +// if(arrLen > 0 && shapeInfo[i] > 1) { +// offset += (index / arrLen) * shapeInfo[i + rank]; +// index %= arrLen; +// } +// } +// return offset; +// } - INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen) { +// INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo, uint arrLen) { - const uint rank = shapeInfo[0]; - const uint ews = shapeInfo[rank + rank + 2]; +// const uint rank = shapeInfo[0]; +// const uint ews = shapeInfo[rank + rank + 2]; - if(ews > 0 && shapeInfo[rank + rank + 3] == 99) - if (ews == 1) - return index; - else - return ews * index; +// if(ews > 0 && shapeInfo[rank + rank + 3] == 99) +// if (ews == 1) +// return index; +// else +// return ews * index; - uint offset = 0; +// uint offset = 0; - for(uint i = 1; i <= rank; ++i) { - arrLen /= shapeInfo[i]; - if(arrLen > 0 && shapeInfo[i] > 1) { - offset += (index / arrLen) * shapeInfo[i + rank]; - index %= arrLen; - } - } - return offset; - } - - INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, Nd4jLong arrLen, const bool useUnsigned) { - - if(useUnsigned) - return getIndexOffset(static_cast(index), uShapeInfo, static_cast(arrLen)); - - return getIndexOffset(index, lShapeInfo, arrLen); - } +// for(uint i = 1; i <= rank; ++i) { +// arrLen /= shapeInfo[i]; +// if(arrLen > 0 && shapeInfo[i] > 1) { +// offset += (index / arrLen) * shapeInfo[i + rank]; +// index %= arrLen; +// } +// } +// return offset; +// } ////////////////////////////////////////////////////////////////////// - INLINEDEF _CUDA_HD Nd4jLong getIndexOrderOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen, const char order) { +INLINEDEF _CUDA_HD Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo) { - Nd4jLong offset = 0; - if(order == 'c') { - for(int i = 1; i <= *shapeInfo; ++i) { - arrLen /= shapeInfo[i]; - if(arrLen > 0 && shapeInfo[i] > 1) { - offset += (index / arrLen) * shapeInfo[i + *shapeInfo]; - index %= arrLen; - } - } - } - else { - for(int i = *shapeInfo; i >= 1 ; --i) { - arrLen /= shapeInfo[i]; - if(arrLen > 0 && shapeInfo[i] > 1) { - offset += (index / arrLen) * shapeInfo[i + *shapeInfo]; - index %= arrLen; - } - } - } - return offset; + if (shapeInfo[2 * shapeInfo[0] + 3] == 99) { + + const Nd4jLong ews = shapeInfo[2 * shapeInfo[0] + 2]; + if (ews == 1) + return index; + else if(ews > 1) + return ews * index; } + Nd4jLong offset = 0; + + for(uint i = shapeInfo[0]; i > 1; --i) { + offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]]; + index /= shapeInfo[i]; + } + + offset += index * shapeInfo[1 + shapeInfo[0]]; // last iteration + + return offset; +} + +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD uint getIndexOffset(uint index, const uint *shapeInfo) { + + if (shapeInfo[2 * shapeInfo[0] + 3] == 99) { + + const Nd4jLong ews = shapeInfo[2 * shapeInfo[0] + 2]; + if (ews == 1) + return index; + else if(ews > 1) + return ews * index; + } + + uint offset = 0; + + for(uint i = shapeInfo[0]; i > 1; --i) { + offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]]; + index /= shapeInfo[i]; + } + + offset += index * shapeInfo[1 + shapeInfo[0]]; // last iteration + + return offset; +} + + 
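For reference, a minimal standalone sketch of the index-to-offset traversal that the two new getIndexOffset overloads above implement. It deliberately omits the ews/order fast path and uses a hypothetical rank-2 shapeInfo prefix (rank, then shape, then strides), so it illustrates the arithmetic rather than reproducing the library function:

#include <cassert>
typedef long long Nd4jLong;

// Peel coordinates off the linear index from the last dimension to the
// first, multiplying each coordinate by its stride (same loop as above;
// the fast path for ews > 0 / 'c'-order arrays is omitted here).
static Nd4jLong indexToOffsetSketch(Nd4jLong index, const Nd4jLong *shapeInfo) {
    Nd4jLong offset = 0;
    for (Nd4jLong i = shapeInfo[0]; i > 1; --i) {
        offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]];
        index /= shapeInfo[i];
    }
    return offset + index * shapeInfo[1 + shapeInfo[0]]; // first dimension
}

int main() {
    const Nd4jLong shapeInfo[] = {2, 2, 3, 3, 1};   // rank 2, shape {2,3}, strides {3,1}
    assert(indexToOffsetSketch(4, shapeInfo) == 4); // index 4 -> coords [1,1] -> 1*3 + 1*1
    return 0;
}

The key behavioral point of the refactoring is visible here: the loop consumes the index itself via % and /, so the old arrLen parameter (and the division chain built on it) is no longer needed.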
+////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeInfo, const uint* uShapeInfo, const bool useUnsigned) { + + if(useUnsigned) + return getIndexOffset(static_cast(index), uShapeInfo); + + return getIndexOffset(index, lShapeInfo); +} + /** * * @param length @@ -2394,7 +2381,7 @@ template auto indices = new Nd4jLong[rank]; memset((void *) indices,0,rank * sizeof(Nd4jLong)); indices[0] = sliceIdx; - Nd4jLong offset = shape::getOffset(0,newShape,newStride,indices,rank); + Nd4jLong offset = shape::getOffset(newShapeBuffer, indices); newShapeBuffer[shape::shapeInfoLength(newRank) - 3] = offset; // set current order and ews @@ -3201,30 +3188,30 @@ INLINEDEF _CUDA_HD bool haveSameShapeAndStrides(const Nd4jLong *shapeInfo1, cons * @param indices the indices to iterate over * @return the double at the specified index */ - INLINEDEF _CUDA_HD Nd4jLong getOffset(Nd4jLong baseOffset, const Nd4jLong *shape, const Nd4jLong *stride, const Nd4jLong *indices, const int rank) { - Nd4jLong offset = baseOffset; - for(int i = 0; i < rank; i++) { - if(shape[i] != 1) - offset += indices[i] * stride[i]; - } - return offset; - } +////////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset) { - INLINEDEF _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset) { - return shape::getOffset(baseOffset, shape::shapeOf(const_cast(shapeInfo)), shape::stride(const_cast(shapeInfo)), indices, shapeInfo[0]); - } + Nd4jLong offset = baseOffset; - INLINEDEF Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector& indices) { + for(uint i = 1; i <= shapeInfo[0]; ++i) + if(shapeInfo[i] != 1) + offset += indices[i - 1] * shapeInfo[shapeInfo[0] + i]; - Nd4jLong offset = 0; + return offset; +} - for(uint i = 0; i < shapeInfo[0]; ++i) - if(shapeInfo[i + 1] != 1) - offset += indices[i] * shapeInfo[shapeInfo[0] + i + 1]; +////////////////////////////////////////////////////////////////////////// +INLINEDEF Nd4jLong getOffset(const Nd4jLong *shapeInfo, const std::vector& indices) { - return offset; - } + Nd4jLong offset = 0; + + for(uint i = 1; i <= shapeInfo[0]; ++i) + if(shapeInfo[i] != 1) + offset += indices[i - 1] * shapeInfo[shapeInfo[0] + i]; + + return offset; +} @@ -4209,24 +4196,24 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con INLINEDEF _CUDA_HD Nd4jLong subArrayIndex(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) { Nd4jLong maxIdxs[MAX_RANK]; - shape::index2coords(shape::rank(maxShapeInfo), const_cast(maxShapeInfo)+1, const_cast(maxIdx), maxIdxs, shape::order(maxShapeInfo)); + shape::index2coords(const_cast(maxIdx), maxShapeInfo, maxIdxs); Nd4jLong minIdxs[MAX_RANK]; maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen); - return coords2index(shape::rank(minShapeInfo), minShapeInfo + 1, minIdxs); + return shape::coords2index(minShapeInfo, minIdxs); } ////////////////////////////////////////////////////////////////////// INLINEDEF _CUDA_HD Nd4jLong subArrayOffset(const Nd4jLong maxIdx, const Nd4jLong* maxShapeInfo, const Nd4jLong* minShapeInfo, const int* dimsToExclude, const int dimsLen) { Nd4jLong maxIdxs[MAX_RANK]; - shape::index2coords(shape::rank(maxShapeInfo), const_cast(maxShapeInfo)+1, 
const_cast(maxIdx), maxIdxs, shape::order(maxShapeInfo)); + shape::index2coords(const_cast(maxIdx), maxShapeInfo, maxIdxs); Nd4jLong minIdxs[MAX_RANK]; maxIndToMinInd(maxIdxs, minIdxs, maxShapeInfo, minShapeInfo, dimsToExclude, dimsLen); - return getOffset(0, minShapeInfo + 1, minShapeInfo + shape::rank(minShapeInfo) + 1, minIdxs, shape::rank(minShapeInfo)); + return getOffset(minShapeInfo, minIdxs); } ////////////////////////////////////////////////////////////////////// @@ -4246,7 +4233,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con int N, minI, maxI; // calculate min per-dim-indices which corresponds to absolute minIdx index - shape::index2coords(rankMin, minShapeInfo + 1, minIdx, indices, order(minShapeInfo)); + shape::index2coords(minIdx, minShapeInfo, indices); // transform storage indices to contain per-dim max indices, purpose - memory saving // fill increment array as well @@ -4277,7 +4264,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con maxI = rankMax-1; N = 0; int step; - maxOffsets[N++] = shape::getOffset(0, maxShapeInfo + 1, maxShapeInfo + rankMax + 1, indices, rankMax); + maxOffsets[N++] = shape::getOffset(maxShapeInfo, indices); // nested loops - producing of absolute indices for max array while(maxI >= 0) { @@ -4290,7 +4277,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con step = -1; } else { - maxOffsets[N++] = shape::getOffset(0, maxShapeInfo + 1, maxShapeInfo + rankMax + 1, indices, rankMax); + maxOffsets[N++] = shape::getOffset(maxShapeInfo, indices); step = rankMax - 1 - maxI; } } @@ -4322,7 +4309,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con int N, minI, maxI; // calculate min per-dim-indices which corresponds to absolute minIdx index - shape::index2coords(rankMin, minShapeInfo + 1, minIdx, indices, order(minShapeInfo)); + shape::index2coords(minIdx, minShapeInfo, indices); // transform storage indices to contain per-dim max indices, purpose - memory saving // fill increment array as well @@ -4353,7 +4340,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con maxI = rankMax-1; N = 0; int step; - maxIdxs[N++] = coords2index(rankMax, maxShapeInfo + 1, indices); + maxIdxs[N++] = shape::coords2index(maxShapeInfo, indices); // nested loops - producing of absolute indices for max array while(maxI >= 0) { @@ -4366,7 +4353,7 @@ INLINEDEF _CUDA_HD void maxIndToMinInd(Nd4jLong* maxIdxs, Nd4jLong* minIdxs, con step = -1; } else { - maxIdxs[N++] = coords2index(rankMax, maxShapeInfo + 1, indices); + maxIdxs[N++] = shape::coords2index(maxShapeInfo, indices); step = rankMax - 1 - maxI; } } @@ -4693,37 +4680,23 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong *coords, const char order) { - Nd4jLong arrLen = shape::prodLong(shape, rank); - shape::index2coords(rank, shape, index, arrLen, coords, order); +INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong *coords) { + + for(uint i = shapeInfo[0]; i > 1; --i) { + coords[i - 1] = index % shapeInfo[i]; + index /= shapeInfo[i]; + } + coords[0] = index; // last iteration } -INLINEDEF void _CUDA_HD index2coords(const int rank, const Nd4jLong *shape, Nd4jLong index, Nd4jLong arrLen, Nd4jLong *coords, const char order) { 
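To make the coordinate round-trip concrete, here is a small self-contained sketch mirroring the two C-order helpers of this refactoring: coords2index (the rank/shape overload added earlier in this file) and the rank/shape overload of index2coords added just below. The function names idx2coords/coords2idx and the literal shape {2, 4} are hypothetical, local to this example; the loops copy the patch's arithmetic:

#include <cassert>
typedef long long Nd4jLong;

// index % shape[i] peels off the i-th coordinate; C order only now.
static void idx2coords(Nd4jLong index, int rank, const Nd4jLong *shape, Nd4jLong *coords) {
    for (int i = rank - 1; i > 0; --i) {
        coords[i] = index % shape[i];
        index /= shape[i];
    }
    coords[0] = index;
}

// Inverse: accumulate the coordinates back into a linear index.
static Nd4jLong coords2idx(int rank, const Nd4jLong *shape, const Nd4jLong *coords) {
    Nd4jLong index = coords[rank - 1], shift = 1;
    for (int i = rank - 1; i >= 1; --i) {
        shift *= shape[i];
        index += shift * coords[i - 1];
    }
    return index;
}

int main() {
    const Nd4jLong shape[] = {2, 4};
    Nd4jLong coords[2];
    idx2coords(5, 2, shape, coords);           // 5 -> [1, 1], as the new doc comment says
    assert(coords[0] == 1 && coords[1] == 1);
    assert(coords2idx(2, shape, coords) == 5); // and back again
    return 0;
}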
+////////////////////////////////////////////////////////////////////// +INLINEDEF void _CUDA_HD index2coords(Nd4jLong index, const int rank, const Nd4jLong *shape, Nd4jLong *coords) { - if(order == 'c') { - - for(int i = 0; i < rank; i++) { - arrLen /= shape[i]; - if(arrLen > 0 && shape[i] > 1) { - coords[i] = index / arrLen; - index %= arrLen; - } - else - coords[i] = 0; - } - } - else { - - for(int i = rank - 1; i >= 0; i--) { - arrLen /= shape[i]; - if(arrLen > 0 && shape[i] > 1) { - coords[i] = index / arrLen; - index %= arrLen; - } - else - coords[i] = 0; - } + for(uint i = rank - 1; i > 0; --i) { + coords[i] = index % shape[i]; + index /= shape[i]; } + coords[0] = index; // last iteration } ////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cpu/broadcasting.hpp b/libnd4j/include/loops/cpu/broadcasting.hpp index dce9ca54b..3bd619827 100644 --- a/libnd4j/include/loops/cpu/broadcasting.hpp +++ b/libnd4j/include/loops/cpu/broadcasting.hpp @@ -170,13 +170,13 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oX = x + tadOffsets[i]; auto oZ = z + zTadOffset[i]; PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } } @@ -190,14 +190,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } @@ -211,14 +211,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } @@ -232,14 +232,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } @@ -255,15 +255,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < 
tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } } @@ -362,7 +362,7 @@ namespace functions { PRAGMA_OMP_SIMD for (unsigned int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } } @@ -382,8 +382,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } } @@ -403,8 +403,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, lenX, canCastX); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto xOffset = shape::indexOffset(f, yShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } } @@ -424,8 +424,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -447,9 +447,9 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } } diff --git a/libnd4j/include/loops/cpu/broadcasting_bool.cpp b/libnd4j/include/loops/cpu/broadcasting_bool.cpp index 54950951c..bca423e3e 100644 --- a/libnd4j/include/loops/cpu/broadcasting_bool.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_bool.cpp @@ -126,7 +126,7 
@@ namespace functions { if (zTadShapeInfo == nullptr) { zTadShapeInfo = xTadShapeShapeInfo; zTadOffset = tadOffsets; - } + } auto lenZ = shape::length(zTadShapeInfo); auto lenY = shape::length(yShapeInfo); @@ -140,7 +140,7 @@ namespace functions { auto zEws = shape::elementWiseStride(zTadShapeInfo); const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); - + if (kindOfLoop == nd4j::LoopKind::EWS1) { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { @@ -170,15 +170,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - + // TODO: cover this codebranch with tests // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } } @@ -192,14 +192,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } @@ -213,14 +213,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } @@ -234,14 +234,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } @@ -257,15 +257,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, 
tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); } } @@ -365,7 +365,7 @@ namespace functions { // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } } @@ -385,8 +385,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } } @@ -406,8 +406,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } } @@ -427,8 +427,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -450,9 +450,9 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } } diff --git a/libnd4j/include/loops/cpu/broadcasting_int.cpp b/libnd4j/include/loops/cpu/broadcasting_int.cpp index c092da50b..375d7577a 100644 --- a/libnd4j/include/loops/cpu/broadcasting_int.cpp +++ b/libnd4j/include/loops/cpu/broadcasting_int.cpp @@ -126,7 +126,7 @@ namespace functions { if (zTadShapeInfo == nullptr) { zTadShapeInfo = xTadShapeShapeInfo; zTadOffset = tadOffsets; - } + } auto lenZ = shape::length(zTadShapeInfo); auto lenY = shape::length(yShapeInfo); @@ -140,7 +140,7 @@ namespace functions { auto zEws = shape::elementWiseStride(zTadShapeInfo); const nd4j::LoopKind::Kind 
kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xTadShapeShapeInfo, yShapeInfo, zTadShapeInfo); - + if (kindOfLoop == nd4j::LoopKind::EWS1) { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { @@ -170,15 +170,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; - + // TODO: cover this codebranch with tests // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); oZ[offset] = OpType::op(oX[offset], y[offset]); } } @@ -192,14 +192,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[offset], y[offset]); } } @@ -213,14 +213,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto offset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[offset], y[yOffset]); } } @@ -234,14 +234,14 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto offset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); oZ[offset] = OpType::op(oX[xOffset], y[offset]); } } @@ -257,15 +257,15 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_THREADS(threads) for (int i = 0; i < tads; i++) { - + auto oZ = z + zTadOffset[i]; auto oX = x + tadOffsets[i]; PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastX); - auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, lenY, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xTadShapeShapeInfo, tadShapeShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(oX[xOffset], y[yOffset]); 
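// [editorial sketch] Every hunk in these broadcasting files makes the same mechanical change:
// shape::indexOffset() drops its length argument. The offset computation only ever needed the
// shape and strides; the total length is the product of the shape, so carrying it as a separate
// parameter was redundant. A minimal stand-alone illustration of what an indexOffset-style
// helper computes (hypothetical names, plain row-major layout assumed; this is NOT libnd4j's
// actual implementation):
#include <cstdint>
inline int64_t indexToOffset(int64_t index, const int64_t* shape, const int64_t* strides, int rank) {
    int64_t offset = 0;
    for (int d = rank - 1; d >= 0; --d) {   // unflatten from innermost to outermost dimension
        offset += (index % shape[d]) * strides[d];
        index  /= shape[d];
    }
    return offset;
}
// e.g. shape {2,3} with strides {3,1}: index 4 -> coords (1,1) -> offset 1*3 + 1*1 = 4.
// The uint-cast variants above (tadShapeShapeInfoCast etc.) keep the same contract but run the
// coordinate math on 32-bit values whenever castShapeInfo() reports the shape fits.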
} } @@ -365,7 +365,7 @@ namespace functions { // all this stuff already happens within thread PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); oZ[offset] = OpType::op(x[offset], oY[offset]); } } @@ -385,8 +385,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[offset], oY[offset]); } } @@ -406,8 +406,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto offset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[xOffset], oY[offset]); } } @@ -427,8 +427,8 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto offset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); oZ[offset] = OpType::op(x[offset], oY[yOffset]); } } @@ -450,9 +450,9 @@ namespace functions { PRAGMA_OMP_SIMD for (int f = 0; f < tadLength; f++) { - auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, lenX, canCastX); - auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCastY); - auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, lenZ, canCastZ); + auto xOffset = shape::indexOffset(f, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(f, yTadShapeShapeInfo, tadShapeShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(f, zTadShapeInfo, tadShapeInfoZCast, canCastZ); oZ[zOffset] = OpType::op(x[xOffset], oY[yOffset]); } } diff --git a/libnd4j/include/loops/cpu/indexreduce.cpp b/libnd4j/include/loops/cpu/indexreduce.cpp index 5a7beee24..23286ecd9 100644 --- a/libnd4j/include/loops/cpu/indexreduce.cpp +++ b/libnd4j/include/loops/cpu/indexreduce.cpp @@ -92,7 +92,7 @@ Nd4jLong IndexReduce::execScalar(void *vx, Nd4jLong *xShapeInfo, void *vex auto ulen = info.getItersPerThread(threadNum); for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, len, canCastX); + auto offset = shape::indexOffset(threadOffset + i, xShapeInfo, xShapeInfoCast, canCastX); IndexValue curr(x[offset], threadOffset + i); local = OpType::update(local, curr, extraParams); } diff --git a/libnd4j/include/loops/cpu/pairwise.hpp b/libnd4j/include/loops/cpu/pairwise.hpp index 6b0c8cb49..9dfa129aa 100644 --- a/libnd4j/include/loops/cpu/pairwise.hpp +++ b/libnd4j/include/loops/cpu/pairwise.hpp @@ -137,7 +137,7 @@ namespace 
functions { void *vz, Nd4jLong* zShapeInfo, void *vextraParams) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); @@ -152,13 +152,13 @@ namespace functions { if (shape::isScalar(yShapeInfo)) { - uint xShapeInfoCast[MAX_RANK]; + uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); - + if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); @@ -166,25 +166,25 @@ namespace functions { PRAGMA_OMP_SIMD for(unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[0], extraParams); } } } else { - uint zShapeInfoCast[MAX_RANK]; + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); } } @@ -192,18 +192,18 @@ namespace functions { return; } - + const nd4j::LoopKind::Kind kindOfLoop = nd4j::LoopKind::deduceKindOfLoopXYZ(xShapeInfo, yShapeInfo, zShapeInfo); const bool sameShapesXY = shape::shapeEquals(xShapeInfo, yShapeInfo); if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n); - } + } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); - } - else { + } + else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -211,14 +211,14 @@ namespace functions { bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[offset], extraParams); } } @@ -231,15 +231,15 @@ namespace functions { bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for 
(unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[offset], y[offset], extraParams); } } @@ -252,15 +252,15 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[offset], y[yOffset], extraParams); } } @@ -273,15 +273,15 @@ namespace functions { bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[xOffset], y[offset], extraParams); } } @@ -296,16 +296,16 @@ namespace functions { bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } diff --git a/libnd4j/include/loops/cpu/pairwise_bool.cpp b/libnd4j/include/loops/cpu/pairwise_bool.cpp index 30d093bce..8feabb98a 100644 --- a/libnd4j/include/loops/cpu/pairwise_bool.cpp +++ b/libnd4j/include/loops/cpu/pairwise_bool.cpp @@ -61,7 +61,7 @@ namespace functions { Nd4jLong zEws, void *vextraParams, const Nd4jLong n) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); 
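// [editorial sketch] The pairwise kernels in this file all follow the same OmpLaunchHelper
// pattern: each OpenMP thread asks for its starting offset and its iteration count, then runs a
// SIMD loop over its own slice. A simplified stand-alone version of that partitioning
// (hypothetical names; the real OmpLaunchHelper also bounds the thread count by problem size):
#include <algorithm>
#include <cstdint>
struct LaunchSlice {
    int64_t offset;   // first element this thread owns
    int64_t iters;    // how many elements it processes
};
inline LaunchSlice sliceFor(int threadNum, int numThreads, int64_t n) {
    const int64_t chunk = n / numThreads;          // even share per thread
    const int64_t rem   = n % numThreads;          // first `rem` threads take one extra element
    const int64_t offset = threadNum * chunk + std::min<int64_t>(threadNum, rem);
    const int64_t iters  = chunk + (threadNum < rem ? 1 : 0);
    return {offset, iters};
}
// A thread then runs: for (i = 0; i < iters; i++) z[offset + i] = op(x[offset + i], y[offset + i]);
// which is exactly the threadOffset/ulen loop shape repeated throughout the hunks below.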
auto z = reinterpret_cast(vz); @@ -72,9 +72,9 @@ namespace functions { if (xEws == 1 && yEws == 1 && zEws == 1) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); + Nd4jLong threadOffset = info.getThreadOffset(threadNum); auto xi = x + threadOffset; auto yi = y + threadOffset; auto zi = z + threadOffset; @@ -88,9 +88,9 @@ namespace functions { else { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); + Nd4jLong threadOffset = info.getThreadOffset(threadNum); auto xi = x + xEws*threadOffset; auto yi = y + yEws*threadOffset; auto zi = z + zEws*threadOffset; @@ -151,33 +151,33 @@ namespace functions { if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[0], extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); } } @@ -190,11 +190,11 @@ namespace functions { if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n); - } + } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); } - else { + else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -202,83 +202,83 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[offset], extraParams); } } } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = 
nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[offset], y[offset], extraParams); } } } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[offset], y[yOffset], extraParams); } } } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[xOffset], y[offset], extraParams); } } } else { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -287,16 +287,16 @@ namespace functions { const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); - 
auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } diff --git a/libnd4j/include/loops/cpu/pairwise_int.cpp b/libnd4j/include/loops/cpu/pairwise_int.cpp index b356adcc2..63b9dc8c8 100644 --- a/libnd4j/include/loops/cpu/pairwise_int.cpp +++ b/libnd4j/include/loops/cpu/pairwise_int.cpp @@ -61,7 +61,7 @@ namespace functions { Nd4jLong zEws, void *vextraParams, const Nd4jLong n) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); @@ -72,9 +72,9 @@ namespace functions { if (xEws == 1 && yEws == 1 && zEws == 1) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); + Nd4jLong threadOffset = info.getThreadOffset(threadNum); auto xi = x + threadOffset; auto yi = y + threadOffset; auto zi = z + threadOffset; @@ -88,9 +88,9 @@ namespace functions { else { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); - Nd4jLong threadOffset = info.getThreadOffset(threadNum); + Nd4jLong threadOffset = info.getThreadOffset(threadNum); auto xi = x + xEws*threadOffset; auto yi = y + yEws*threadOffset; auto zi = z + zEws*threadOffset; @@ -151,33 +151,33 @@ namespace functions { if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[0], extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for(Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[0], extraParams); } } @@ -190,11 +190,11 @@ namespace functions { if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && sameShapesXY) { exec(x, xEws, y, yEws, z, zEws, extraParams, n); - } + } else if ((kindOfLoop == nd4j::LoopKind::EWS1 || kindOfLoop == nd4j::LoopKind::EWSNONZERO) && !sameShapesXY) { //not same shape exec(x, xEws, y, yEws, z, zEws, extraParams, shape::length(yShapeInfo)); } - else { + else { if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && 
shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { @@ -202,83 +202,83 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], y[offset], extraParams); } } } else if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[offset], y[offset], extraParams); } } } else if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[offset], y[yOffset], extraParams); } } } else if(shape::haveSameShapeAndStrides(yShapeInfo, zShapeInfo)) { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, 
canCastX); + auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpType::op(x[xOffset], y[offset], extraParams); } } } else { - + uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; @@ -287,16 +287,16 @@ namespace functions { const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, n, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, n, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, n, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } diff --git a/libnd4j/include/loops/cpu/random.cpp b/libnd4j/include/loops/cpu/random.cpp index 30bab1327..5abc1447a 100644 --- a/libnd4j/include/loops/cpu/random.cpp +++ b/libnd4j/include/loops/cpu/random.cpp @@ -50,27 +50,27 @@ namespace functions { return; } - auto length = shape::length(zShapeInfo); + auto length = shape::length(zShapeInfo); // nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); - + if(shape::haveSameShapeAndStrides(xShapeInfo, yShapeInfo) && shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { uint xShapeInfoCast[MAX_RANK]; const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } } @@ -79,19 +79,19 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, 
zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[offset], y[offset], i, length, rng, extraArguments); } } @@ -100,19 +100,19 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[offset], y[yOffset], i, length, rng, extraArguments); } } @@ -121,19 +121,19 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD - for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY); + for (Nd4jLong i = 0; i < info.getItersPerThread(threadNum); i++) { + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto offset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); z[offset] = OpClass::op(x[xOffset], y[offset], i, length, rng, extraArguments); } } @@ -143,21 +143,21 @@ namespace functions { uint xShapeInfoCast[MAX_RANK]; uint yShapeInfoCast[MAX_RANK]; uint zShapeInfoCast[MAX_RANK]; - const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); + const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); const bool canCastY = nd4j::DataTypeUtils::castShapeInfo(yShapeInfo, yShapeInfoCast); const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, length, canCastY); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, 
canCastX); + auto yOffset = shape::indexOffset(i + threadOffset, yShapeInfo, yShapeInfoCast, canCastY); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], y[yOffset], i, length, rng, extraArguments); } } @@ -185,18 +185,18 @@ namespace functions { nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); - + if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpClass::op(x[offset], i, length, rng, extraArguments); } } @@ -207,15 +207,15 @@ namespace functions { const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, length, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpClass::op(x[xOffset], i, length, rng, extraArguments); } } @@ -231,7 +231,7 @@ namespace functions { auto extraArguments = reinterpret_cast(vextraArguments); auto length = shape::length(zShapeInfo); - + //nd4j::random::RandomBuffer *buffer = reinterpret_cast (state); nd4j::graph::RandomGenerator* rng = reinterpret_cast(state); nd4j::OmpLaunchHelper info(length); @@ -240,14 +240,14 @@ namespace functions { const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) - { + { auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (Nd4jLong i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, length, canCastZ); + auto offset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[offset] = OpClass::op(i+threadOffset, length, rng, extraArguments); } } diff --git a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp index a7145846e..246d18ac4 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_bool.cpp @@ -77,7 +77,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, 
xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) @@ -112,7 +112,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < omp_get_max_threads(); e++) start = OpType::update(start, intermediate[e], extraParams); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp index 8d04b7cdb..a94a19b25 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_float.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_float.cpp @@ -81,7 +81,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) @@ -115,7 +115,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < omp_get_max_threads(); e++) start = OpType::update(start, intermediate[e], extraParams); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp index 9069f4198..1a148805e 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_long.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_long.cpp @@ -77,7 +77,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) @@ -113,7 +113,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < omp_get_max_threads(); e++) start = OpType::update(start, intermediate[e], 
extraParams); diff --git a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp index 676348017..0dfff5e73 100644 --- a/libnd4j/include/loops/cpu/reduce/reduce_same.cpp +++ b/libnd4j/include/loops/cpu/reduce/reduce_same.cpp @@ -79,7 +79,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) @@ -117,7 +117,7 @@ namespace functions { PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(maxThreads) for(Nd4jLong i = 0; i < length; ++i) - intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX)], extraParams), extraParams); + intermediate[omp_get_thread_num()] = OpType::update(intermediate[omp_get_thread_num()], OpType::op(x[shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX)], extraParams), extraParams); for (int e = 0; e < maxThreads; e++) start = OpType::update(start, intermediate[e], extraParams); diff --git a/libnd4j/include/loops/cpu/reduce3.cpp b/libnd4j/include/loops/cpu/reduce3.cpp index eeea227c8..fd09dc0e1 100644 --- a/libnd4j/include/loops/cpu/reduce3.cpp +++ b/libnd4j/include/loops/cpu/reduce3.cpp @@ -95,7 +95,7 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) for(unsigned int i = 0; i < length; i++) { const auto threadNum = omp_get_thread_num(); - auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX); + auto offset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[offset], y[offset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); } } else { @@ -105,8 +105,8 @@ void Reduce3::execScalar(void *vx, Nd4jLong *xShapeInfo, PRAGMA_OMP_PARALLEL_FOR_SIMD_THREADS(t._numThreads) for(unsigned int i = 0; i < length; i++) { const auto threadNum = omp_get_thread_num(); - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCastX); - auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, length, canCastY); + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCastX); + auto yOffset = shape::indexOffset(i, yShapeInfo, yShapeInfoCast, canCastY); intermediate[threadNum] = OpType::update(intermediate[threadNum], OpType::op(x[xOffset], y[yOffset], extraParamsLocal + 3 * threadNum), extraParamsLocal + 3 * threadNum); } } diff --git a/libnd4j/include/loops/cpu/scalar.hpp b/libnd4j/include/loops/cpu/scalar.hpp index 8f9fd0990..79e53e4a2 100644 --- a/libnd4j/include/loops/cpu/scalar.hpp +++ b/libnd4j/include/loops/cpu/scalar.hpp @@ -33,14 +33,14 @@ namespace scalar { //////////////////////////////////////////////////////////////////////// template template -void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, +void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, + void *vextraParams, + void *vz, 
Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalars = reinterpret_cast(vscalars); @@ -159,37 +159,37 @@ void ScalarTransform::transform(void *vx, Nd4jLong *xShapeInfo, PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], scalar, extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS_IF(info._numThreads, allowParallelism) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); } } - } - } + } + } } //////////////////////////////////////////////////////////////////////// @@ -200,7 +200,7 @@ void ScalarTransform::transform(void *vx, Nd4jLong xEws, void *vscalar, void *vextraParams, const Nd4jLong len, bool allowParallelism) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; diff --git a/libnd4j/include/loops/cpu/scalar_bool.cpp b/libnd4j/include/loops/cpu/scalar_bool.cpp index 1f400119b..b37bdd6ef 100644 --- a/libnd4j/include/loops/cpu/scalar_bool.cpp +++ b/libnd4j/include/loops/cpu/scalar_bool.cpp @@ -33,14 +33,14 @@ namespace functions { template template - void ScalarBoolTransform::transform(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, + void ScalarBoolTransform::transform(void *vx, Nd4jLong *xShapeInfo, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalars = reinterpret_cast(vscalars); @@ -63,7 +63,7 @@ namespace functions { printf("ScalarBoolTransform::transform: super-bad loop visited. 
Shouldn't ever happen\n"); return; } - + int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { @@ -76,7 +76,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); } - } + } else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) for (unsigned int r = 0; r < numTads; r++) { @@ -87,7 +87,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); } - } + } } template @@ -139,7 +139,7 @@ namespace functions { Nd4jLong *zShapeInfo, void *vscalar, void *vextraParams) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; @@ -162,41 +162,41 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); nd4j::OmpLaunchHelper info(len); - + if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], scalar, extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); } } - } + } } @@ -213,7 +213,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; - auto extraParams = reinterpret_cast(vextraParams); + auto extraParams = reinterpret_cast(vextraParams); nd4j::OmpLaunchHelper info(len); @@ -231,7 +231,7 @@ namespace functions { for (unsigned int i = 0; i < ulen; i++) zi[i] = OpType::op(xi[i], scalar, extraParams); } - } + } else { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) diff --git a/libnd4j/include/loops/cpu/scalar_int.cpp b/libnd4j/include/loops/cpu/scalar_int.cpp index 9920cc836..9e73e2756 100644 --- a/libnd4j/include/loops/cpu/scalar_int.cpp +++ b/libnd4j/include/loops/cpu/scalar_int.cpp @@ -34,13 +34,13 @@ namespace functions { template template void ScalarIntTransform::transform(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, 
Nd4jLong *xTadShapeInfo, Nd4jLong *xTadOffsets, Nd4jLong *zTadShapeInfo, Nd4jLong *zTadOffsets) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalars = reinterpret_cast(vscalars); @@ -63,7 +63,7 @@ namespace functions { printf("ScalarIntTransform::transform: super-bad loop visited. Shouldn't ever happen\n"); return; } - + int num_threads = nd4j::math::nd4j_min(numTads, omp_get_max_threads()); if (kindOfLoop == nd4j::LoopKind::EWS1) { @@ -76,7 +76,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f] = OpType::op(oX[f], scalars[r], extraParams); } - } + } else { // kindOfLoop != nd4j::LoopKind::EWSNONZERO PRAGMA_OMP_PARALLEL_FOR_THREADS(num_threads) for (unsigned int r = 0; r < numTads; r++) { @@ -87,7 +87,7 @@ namespace functions { for (unsigned int f = 0; f < tadLength; f++) oZ[f * zTadEws] = OpType::op(oX[f * xTadEws], scalars[r], extraParams); } - } + } } template @@ -139,7 +139,7 @@ namespace functions { Nd4jLong *zShapeInfo, void *vscalar, void *vextraParams) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; @@ -162,41 +162,41 @@ namespace functions { const bool canCastX = nd4j::DataTypeUtils::castShapeInfo(xShapeInfo, xShapeInfoCast); nd4j::OmpLaunchHelper info(len); - + if(shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo)) { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); + auto offset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); z[offset] = OpType::op(x[offset], scalar, extraParams); } } } else { - + uint zShapeInfoCast[MAX_RANK]; const bool canCastZ = nd4j::DataTypeUtils::castShapeInfo(zShapeInfo, zShapeInfoCast); PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) { - auto threadNum = omp_get_thread_num(); + auto threadNum = omp_get_thread_num(); auto threadOffset = info.getThreadOffset(threadNum); auto ulen = static_cast(info.getItersPerThread(threadNum)); PRAGMA_OMP_SIMD for (unsigned int i = 0; i < ulen; i++) { - auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, len, canCastX); - auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, len, canCastZ); + auto xOffset = shape::indexOffset(i + threadOffset, xShapeInfo, xShapeInfoCast, canCastX); + auto zOffset = shape::indexOffset(i + threadOffset, zShapeInfo, zShapeInfoCast, canCastZ); z[zOffset] = OpType::op(x[xOffset], scalar, extraParams); } } - } + } } @@ -213,7 +213,7 @@ namespace functions { auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto scalar = reinterpret_cast(vscalar)[0]; - auto extraParams = reinterpret_cast(vextraParams); + auto extraParams = reinterpret_cast(vextraParams); nd4j::OmpLaunchHelper info(len); @@ -231,7 +231,7 @@ namespace functions { for (unsigned int i = 0; i < ulen; i++) zi[i] = OpType::op(xi[i], scalar, extraParams); } - } + } else { PRAGMA_OMP_PARALLEL_THREADS(info._numThreads) diff --git a/libnd4j/include/loops/cpu/summarystatsreduce.cpp b/libnd4j/include/loops/cpu/summarystatsreduce.cpp index ed398db28..1f5a7c339 100644 --- a/libnd4j/include/loops/cpu/summarystatsreduce.cpp +++ b/libnd4j/include/loops/cpu/summarystatsreduce.cpp @@ -92,7 
+92,7 @@ namespace functions { for (Nd4jLong i = 0; i < length; i++) { - auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, length, canCast); + auto xOffset = shape::indexOffset(i, xShapeInfo, xShapeInfoCast, canCast); SummaryStatsData curr; curr.initWithValue(x[xOffset]); @@ -175,7 +175,7 @@ namespace functions { } else { for (int i = 1; i < tadLength; i ++) { - auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, tadLength, canCast); + auto xOffset = shape::indexOffset(i, tadShapeShapeInfo, tadShapeShapeInfoCast, canCast); SummaryStatsData indexVal2; indexVal2.initWithValue(tx[xOffset]); diff --git a/libnd4j/include/loops/cuda/broadcasting.chpp b/libnd4j/include/loops/cuda/broadcasting.chpp index d930d8cad..086e216e6 100644 --- a/libnd4j/include/loops/cuda/broadcasting.chpp +++ b/libnd4j/include/loops/cuda/broadcasting.chpp @@ -42,7 +42,7 @@ static __global__ void broadcastSimple( Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::broadcast::Broadcast::template transformCuda(x,xShapeInfo,y,yShapeInfo,z,zShapeInfo,dimension,dimensionLength,tadOnlyShapeInfo,tadOffsets,tadOnlyShapeInfoZ,tadOffsetsZ); } @@ -64,8 +64,8 @@ static __global__ void broadcastInverseSimple( namespace functions { namespace broadcast { - static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) { - return shape::getIndexOffset(index, shapeInfo, length); + static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) { + return shape::getIndexOffset(index, shapeInfo); } static Nd4jLong __device__ __noinline__ _length(Nd4jLong *shapeInfo) { @@ -154,9 +154,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - auto xOffset = _getIndexOffset(i, xShapeInfo, tadLength); - auto yOffset = _getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = _getIndexOffset(i, xShapeInfo); + auto yOffset = _getIndexOffset(i, tadOnlyShapeInfo); + auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]); } } @@ -170,14 +170,14 @@ namespace functions { void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength, + int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { if (tadOnlyShapeInfoZ == nullptr) { tadOnlyShapeInfoZ = tadOnlyShapeInfo; tadOffsetsZ = tadOffsets; } - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); @@ -212,16 +212,16 @@ namespace functions { if(tadEWS > 0 && zEWS > 0 && yEWS > 0 && xOrder == yOrder && xOrder == zOrder) { - for (int i = threadIdx.x; i < tadLength; i+= blockDim.x) - rZ[i * zEWS] = OpType::op(rX[i * tadEWS], y[i * yEWS]); + for (int i = threadIdx.x; i < tadLength; i+= blockDim.x) + rZ[i * zEWS] = OpType::op(rX[i * tadEWS], y[i * yEWS]); } else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - - auto xOffset = _getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto yOffset = _getIndexOffset(i, yShapeInfo, tadLength); - auto zOffset = 
_getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + + auto xOffset = _getIndexOffset(i, tadOnlyShapeInfo); + auto yOffset = _getIndexOffset(i, yShapeInfo); + auto zOffset = _getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]); } } diff --git a/libnd4j/include/loops/cuda/broadcasting_bool.cu b/libnd4j/include/loops/cuda/broadcasting_bool.cu index 8981790f5..aaec44690 100644 --- a/libnd4j/include/loops/cuda/broadcasting_bool.cu +++ b/libnd4j/include/loops/cuda/broadcasting_bool.cu @@ -42,7 +42,7 @@ static __global__ void broadcastBoolSimple( Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::broadcast::BroadcastBool::template transformCuda(x,xShapeInfo,y,yShapeInfo,z,zShapeInfo,dimension,dimensionLength,tadOnlyShapeInfo,tadOffsets,tadOnlyShapeInfoZ,tadOffsetsZ); } @@ -145,9 +145,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, tadLength); - auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo); + auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]); } @@ -183,13 +183,13 @@ namespace functions { __shared__ int numTads; __shared__ Nd4jLong yEWS; __shared__ Nd4jLong zEWS; - + if (threadIdx.x == 0) { tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); tadEWS = shape::elementWiseStride(tadOnlyShapeInfo); numTads = shape::length(xShapeInfo) / tadLength; yEWS = shape::elementWiseStride(yShapeInfo); - zEWS = shape::elementWiseStride(tadOnlyShapeInfoZ); + zEWS = shape::elementWiseStride(tadOnlyShapeInfoZ); } __syncthreads(); @@ -213,9 +213,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]); } diff --git a/libnd4j/include/loops/cuda/broadcasting_int.cu b/libnd4j/include/loops/cuda/broadcasting_int.cu index 38193f35d..fc613a438 100644 --- a/libnd4j/include/loops/cuda/broadcasting_int.cu +++ b/libnd4j/include/loops/cuda/broadcasting_int.cu @@ -42,7 +42,7 @@ static __global__ void broadcastIntSimple( Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadOnlyShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::broadcast::BroadcastInt::template transformCuda(x,xShapeInfo,y,yShapeInfo,z,zShapeInfo,dimension,dimensionLength,tadOnlyShapeInfo,tadOffsets,tadOnlyShapeInfoZ,tadOffsetsZ); } @@ -139,9 +139,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= 
blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, tadLength); - auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, tadOnlyShapeInfo); + auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(x[xOffset], rY[yOffset]); } @@ -177,13 +177,13 @@ namespace functions { __shared__ int numTads; __shared__ Nd4jLong yEWS; __shared__ Nd4jLong zEWS; - + if (threadIdx.x == 0) { tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); tadEWS = shape::elementWiseStride(tadOnlyShapeInfo); numTads = shape::length(xShapeInfo) / tadLength; yEWS = shape::elementWiseStride(yShapeInfo); - zEWS = shape::elementWiseStride(tadOnlyShapeInfoZ); + zEWS = shape::elementWiseStride(tadOnlyShapeInfoZ); } __syncthreads(); @@ -207,9 +207,9 @@ namespace functions { else { // it is expected that x and z tads and y array all have the same length for (Nd4jLong i = threadIdx.x; i < tadLength; i+= blockDim.x) { - auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ, tadLength); + auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, tadOnlyShapeInfoZ); rZ[zOffset] = OpType::op(rX[xOffset], y[yOffset]); } diff --git a/libnd4j/include/loops/cuda/indexreduce.cu b/libnd4j/include/loops/cuda/indexreduce.cu index 5f0cf07ae..8a560e416 100644 --- a/libnd4j/include/loops/cuda/indexreduce.cu +++ b/libnd4j/include/loops/cuda/indexreduce.cu @@ -246,12 +246,12 @@ namespace functions { if (dimensionLength > 1 || tadEWS < 1) { for (int r = blockIdx.x; r < numTads; r += gridDim.x) { - + auto tadOffsetForBlock = tadOffsets[r]; sPartials[threadIdx.x] = OpType::startingIndexValue(dx); - for(int i = threadIdx.x;i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + for(int i = threadIdx.x;i < tadLength; i += blockDim.x) { + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); IndexValue comp {dx[xOffset], i}; sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], comp, extraParams); } @@ -297,9 +297,9 @@ namespace functions { reduction = OpType::update(reduction, indexVal, extraParams); } } else { - - for(Nd4jLong i = tid;i < n; i += blockDim.x * gridDim.x) { - auto offset = shape::getIndexOffset(i, xShapeInfo, n); + + for(Nd4jLong i = tid;i < n; i += blockDim.x * gridDim.x) { + auto offset = shape::getIndexOffset(i, xShapeInfo); IndexValue indexVal = {dx[offset], i}; reduction = OpType::update(reduction, indexVal, extraParams); } diff --git a/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h index 3c79f443b..5df583e61 100644 --- a/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/reduce_same_inplace.h @@ -115,7 +115,7 @@ namespace functions { sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], 
OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h index cb87ea461..9e061003d 100644 --- a/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/scalar_inplace.h @@ -73,7 +73,7 @@ namespace functions { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - z[shape::getIndexOffset(i, zShapeInfo, length)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, length)], scalar, params); + z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params); } } } diff --git a/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h b/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h index e3f653350..b10b23d09 100644 --- a/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h +++ b/libnd4j/include/loops/cuda/inplace_loops/transform_strict_inplace.h @@ -72,8 +72,8 @@ namespace functions { for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { - auto xOffset2 = shape::getIndexOffset(i, shapeInfo, length); - auto zOffset2 = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset2 = shape::getIndexOffset(i, shapeInfo); + auto zOffset2 = shape::getIndexOffset(i, zShapeInfo); result[zOffset2] = OpType::op(dy[xOffset2], params); } } diff --git a/libnd4j/include/loops/cuda/legacy/reduce.legacy b/libnd4j/include/loops/cuda/legacy/reduce.legacy index 7b365f9fe..1ae7985de 100644 --- a/libnd4j/include/loops/cuda/legacy/reduce.legacy +++ b/libnd4j/include/loops/cuda/legacy/reduce.legacy @@ -169,7 +169,7 @@ namespace functions { template <> _CUDA_H void ReduceFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, float *x, Nd4jLong *xShapeInfo, float *extraParams, float *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, float *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { - + DISPATCH_SIMPLE(reduceScalarSimple, float, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarFloat(...) failed"); @@ -177,7 +177,7 @@ namespace functions { template <> _CUDA_H void ReduceFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, float16 *x, Nd4jLong *xShapeInfo, float16 *extraParams, float16 *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, float16 *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { - + DISPATCH_SIMPLE(reduceScalarSimple, float16, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarHalf(...) 
failed"); @@ -185,7 +185,7 @@ namespace functions { template <> _CUDA_H void ReduceFunction::execReduceScalar(dim3 launchDims, cudaStream_t *stream, int opNum, double *x, Nd4jLong *xShapeInfo, double *extraParams, double *z, Nd4jLong *zShapeInfo, int *dimension, int dimensionLength, double *reductionBuffer, Nd4jLong *tadOnlyShapeInfo) { - + DISPATCH_SIMPLE(reduceScalarSimple, double, PARAMS(x, xShapeInfo, extraParams, z, zShapeInfo, nullptr, 1, reductionBuffer, tadOnlyShapeInfo), OPS_A(REDUCE_OPS)) nd4j::DebugHelper::checkErrorCode(stream, "execReduceScalarDouble(...) failed"); @@ -294,7 +294,7 @@ namespace functions { for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord); - auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank); + auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams); } @@ -358,7 +358,7 @@ namespace functions { for (int i = tid; i < n; i += blockDim.x * gridDim.x) { shape::ind2subC(rank, xShape, i, n, ind2sub); - auto offset = shape::getOffset(0, xShape, xStride, ind2sub, rank); + auto offset = shape::getOffset(xShapeInfo, ind2sub); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[offset], extraParams), extraParams); } } @@ -461,7 +461,7 @@ namespace functions { for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord); - auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank); + auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams); } @@ -526,7 +526,7 @@ namespace functions { for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { shape::ind2subC(tadRank, tadShape, i, tadLength, xCoord); - auto xOffset = shape::getOffset(tadOffsetForBlock, tadShape, tadStride, xCoord, tadRank); + auto xOffset = shape::getOffset(tadOnlyShapeInfo, xCoord); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(dx[xOffset], extraParams), extraParams); } diff --git a/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy b/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy index 73cb9c6ff..7bc30271f 100644 --- a/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy +++ b/libnd4j/include/loops/cuda/legacy/scalar_temp.legacy @@ -88,8 +88,8 @@ static inline __device__ void transformCuda(T scalar, T *dy, int *shapeInfo, T * for (Nd4jLong i = tid; i < length; i+= totalThreads) { shape::ind2sub(xRank, xShape, i, length, xIdx); - int xOffset2 = shape::getOffset(0, xShape, xStride, xIdx, xRank); - int resultOffset = shape::getOffset(0, zShape, zStride, xIdx, zRank); + int xOffset2 = shape::getOffset(shapeInfo, xIdx); + int resultOffset = shape::getOffset(0resultShapeInfo, xIdx); result[resultOffset] = OpType::op(dy[xOffset2],scalar, params); } } diff --git a/libnd4j/include/loops/cuda/legacy/transform.legacy b/libnd4j/include/loops/cuda/legacy/transform.legacy index ed321e79c..6a8344916 100644 --- a/libnd4j/include/loops/cuda/legacy/transform.legacy +++ b/libnd4j/include/loops/cuda/legacy/transform.legacy @@ -111,7 +111,7 @@ __device__ void transformSimpleGeneric( manager->init(sizeof(UnifiedSharedMemory), 0, sizeof(functions::transform::Transform), sizeof(shape::TAD), xRank); } __syncthreads(); - + 
functions::transform::Transform::template transformCuda( dy, xShapeInfo, @@ -161,7 +161,7 @@ namespace functions { template <> _CUDA_H void Transform::executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, float *x, Nd4jLong *xShape, int xRank, float *extraParams, float *z, Nd4jLong *zShape, int zRank, int *allocationPointer, float *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + DISPATCH_SIMPLE(transformShaped, float, PARAMS(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets), OPS_A(TRANSFORM_OPS)) @@ -170,16 +170,16 @@ namespace functions { template <> _CUDA_H void Transform::executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, float16 *x, Nd4jLong *xShape, int xRank, float16 *extraParams, float16 *z, Nd4jLong *zShape, int zRank, int *allocationPointer, float16 *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + DISPATCH_SIMPLE(transformShaped, float16, PARAMS(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets), OPS_A(TRANSFORM_OPS)) - + if (nd4j::Environment::getInstance()->isDebug()) checkCudaErrors(cudaStreamSynchronize(*stream)); } template <> _CUDA_H void Transform::executeTransformShaped(dim3 launchDims, cudaStream_t *stream, int opNum, double *x, Nd4jLong *xShape, int xRank, double *extraParams, double *z, Nd4jLong *zShape, int zRank, int *allocationPointer, double *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + DISPATCH_SIMPLE(transformShaped, double, PARAMS(x, xShape, xRank, extraParams, z, zShape, zRank, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets), OPS_A(TRANSFORM_OPS)) DEBUG_KERNEL(stream, opNum); @@ -226,13 +226,13 @@ namespace functions { } else { Nd4jLong xCoord[MAX_RANK]; - + for (Nd4jLong i = tid; i < length; i+= gridDim.x * blockDim.x) { shape::ind2sub(xRank,shape::shapeOf(shapeInfo),i, length, xCoord); - - auto xOffset2 = shape::getOffset(0, xShape, xStride, xCoord, xRank); - auto resultOffset2 = shape::getOffset(0,xShape,shape::stride(resultShapeInfo),xCoord,xRank); - + + auto xOffset2 = shape::getOffset(shapeInfo, xCoord); + auto resultOffset2 = shape::getOffset(resultShapeInfo, xCoord); + result[resultOffset2] = OpType::op(dy[xOffset2], params); } } @@ -249,7 +249,7 @@ namespace functions { T *result, Nd4jLong resultStride, int *allocationPointer, T *reductionPointer, UnifiedSharedMemory *manager) { - + int totalThreads = gridDim.x * blockDim.x; Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/libnd4j/include/loops/cuda/pairwise.chpp b/libnd4j/include/loops/cuda/pairwise.chpp index 3f7134887..d3252d862 100644 --- a/libnd4j/include/loops/cuda/pairwise.chpp +++ b/libnd4j/include/loops/cuda/pairwise.chpp @@ -28,11 +28,11 @@ using namespace simdOps; //////////////////////////////////////////////////////////////////////////////// template -__global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, - void *vy, Nd4jLong *yShapeInfo, - void *vz, Nd4jLong *zShapeInfo, +__global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, + void *vy, Nd4jLong *yShapeInfo, + void *vz, Nd4jLong *zShapeInfo, void *vextraParams) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); @@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, } else if (vx == vz) { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - 
auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } else { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } diff --git a/libnd4j/include/loops/cuda/pairwise_bool.cu b/libnd4j/include/loops/cuda/pairwise_bool.cu index 62f040191..414aadd30 100644 --- a/libnd4j/include/loops/cuda/pairwise_bool.cu +++ b/libnd4j/include/loops/cuda/pairwise_bool.cu @@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, } else if (vx == vz) { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } else { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } @@ -105,7 +105,7 @@ void _CUDA_H PairWiseBoolTransform::intermediateShaped(dim3& launchDims, cu template void PairWiseBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, void *vy, Nd4jLong *yShapeInfo, void *vz, Nd4jLong *zShapeInfo, void *vextraParams) { auto xType = nd4j::DataTypeUtils::fromT(); - auto yType = nd4j::DataTypeUtils::fromT(); + auto yType = nd4j::DataTypeUtils::fromT(); DISPATCH_BY_OPNUM_TT(intermediateShaped, PARAMS(launchDims, stream, vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, vextraParams), PAIRWISE_BOOL_OPS); } @@ -166,7 +166,7 @@ void PairWiseBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_ } - + BUILD_DOUBLE_TEMPLATE(template class ND4J_EXPORT PairWiseBoolTransform, , LIBND4J_TYPES, BOOL_TYPES); } } diff --git a/libnd4j/include/loops/cuda/pairwise_int.cu b/libnd4j/include/loops/cuda/pairwise_int.cu index 5cc12846c..2bedb4a82 100644 --- a/libnd4j/include/loops/cuda/pairwise_int.cu +++ b/libnd4j/include/loops/cuda/pairwise_int.cu @@ -67,17 +67,17 @@ __global__ static void pairwiseSimpleShaped(void* vx, Nd4jLong *xShapeInfo, } else if (vx == vz) { for (Nd4jLong i = tid; i < len; i += gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + z[xOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } } else { for (Nd4jLong i = tid; i < len; i += 
gridDim.x * blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, len); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], y[yOffset], extraParams); } @@ -165,7 +165,7 @@ void PairWiseIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t * } - + BUILD_SINGLE_TEMPLATE(template class ND4J_EXPORT PairWiseIntTransform, , INTEGER_TYPES); } } diff --git a/libnd4j/include/loops/cuda/random.cu b/libnd4j/include/loops/cuda/random.cu index 727f0868f..3bf06ae91 100644 --- a/libnd4j/include/loops/cuda/random.cu +++ b/libnd4j/include/loops/cuda/random.cu @@ -116,7 +116,7 @@ namespace functions { auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); - + if (OpClass::requiresSpecial) { OpClass::specialOpCuda(state, x, xShapeBuffer, y, yShapeBuffer, z, zShapeBuffer, extraArguments); return; @@ -166,10 +166,10 @@ namespace functions { } } else { for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) { - - auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer, length); - auto yOffset2 = shape::getIndexOffset(i, yShapeBuffer, length); - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length); + + auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer); + auto yOffset2 = shape::getIndexOffset(i, yShapeBuffer); + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); z[zOffset2] = OpClass::op(x[xOffset2], y[yOffset2], i, length, buffer, extraArguments); } @@ -224,11 +224,11 @@ namespace functions { z[e * zEWS] = OpClass::op(x[e * xEWS], e, length, buffer, extraArguments); } } else { - + for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < length; i += blockDim.x * gridDim.x) { - - auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer, length); - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length); + + auto xOffset2 = shape::getIndexOffset(i, xShapeBuffer); + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); z[zOffset2] = OpClass::op(x[xOffset2], i, length, buffer, extraArguments); } @@ -274,9 +274,9 @@ namespace functions { z[i * ews] = OpClass::op(i, length, buffer, extraArguments); } } else { - - for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, length); + + for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) { + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); z[zOffset2] = OpClass::op(i, length, buffer, extraArguments); } } @@ -296,7 +296,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaSingle(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -320,7 +320,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaSingle(dim3& launchDims, cudaStream_t *stream, int opNum, Nd4jPointer stateHost, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -332,7 +332,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer 
stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -346,7 +346,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -372,7 +372,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaDouble(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto z = reinterpret_cast(vz); auto extraArguments = reinterpret_cast(vextraArguments); @@ -385,7 +385,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaTriple(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vy, Nd4jLong *yShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); @@ -400,7 +400,7 @@ namespace functions { template <> _CUDA_H void RandomFunction::executeCudaTriple(dim3& launchDims, cudaStream_t* stream, int opNum, Nd4jPointer stateHost, void *vx, Nd4jLong *xShapeBuffer, void *vy, Nd4jLong *yShapeBuffer, void *vz, Nd4jLong *zShapeBuffer, void *vextraArguments) { - + auto x = reinterpret_cast(vx); auto y = reinterpret_cast(vy); auto z = reinterpret_cast(vz); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_bool.cu b/libnd4j/include/loops/cuda/reduce/reduce_bool.cu index fea9bcb63..a785094f1 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_bool.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_bool.cu @@ -129,7 +129,7 @@ __device__ void ReduceBoolFunction::transformCudaXD( void *vx, Nd4jLong *xS for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams); } __syncthreads(); @@ -140,7 +140,7 @@ __device__ void ReduceBoolFunction::transformCudaXD( void *vx, Nd4jLong *xS __syncthreads(); if (threadIdx.x == 0) - z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); + z[isPlainOutput ? 
r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); } } @@ -180,7 +180,7 @@ __device__ void ReduceBoolFunction::execScalarCuda(void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_float.chpp b/libnd4j/include/loops/cuda/reduce/reduce_float.chpp index ffd075715..ef366caf7 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_float.chpp +++ b/libnd4j/include/loops/cuda/reduce/reduce_float.chpp @@ -129,7 +129,7 @@ __device__ void ReduceFloatFunction::transformCudaXD( void *vx, Nd4jLong *x sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock); for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams); } __syncthreads(); @@ -139,7 +139,7 @@ __device__ void ReduceFloatFunction::transformCudaXD( void *vx, Nd4jLong *x __syncthreads(); if (threadIdx.x == 0) - z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); + z[isPlainOutput ? 
r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); } } @@ -179,7 +179,7 @@ __device__ void ReduceFloatFunction::execScalarCuda(void *vx, Nd4jLong *xSh sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_long.cu b/libnd4j/include/loops/cuda/reduce/reduce_long.cu index b989298fe..79ab25280 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_long.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_long.cu @@ -150,7 +150,7 @@ __device__ void ReduceLongFunction::transformCudaXD( void *vx, Nd4jLong *xS sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock); for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams); } __syncthreads(); @@ -160,7 +160,7 @@ __device__ void ReduceLongFunction::transformCudaXD( void *vx, Nd4jLong *xS __syncthreads(); if (threadIdx.x == 0) - z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); + z[isPlainOutput ? 
r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); } } @@ -200,7 +200,7 @@ __device__ void ReduceLongFunction::execScalarCuda(void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/reduce/reduce_same.cu b/libnd4j/include/loops/cuda/reduce/reduce_same.cu index 1c367653c..bcf5bab7f 100644 --- a/libnd4j/include/loops/cuda/reduce/reduce_same.cu +++ b/libnd4j/include/loops/cuda/reduce/reduce_same.cu @@ -139,7 +139,7 @@ __device__ void ReduceSameFunction::transformCudaXD( void *vx, Nd4jLong *xSha sPartials[threadIdx.x] = OpType::startingValue(x + tadOffsetForBlock); for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[xOffset], extraParams), extraParams); } __syncthreads(); @@ -149,7 +149,7 @@ __device__ void ReduceSameFunction::transformCudaXD( void *vx, Nd4jLong *xSha __syncthreads(); if (threadIdx.x == 0) - z[isPlainOutput ? r : shape::getIndexOffset(r, zShapeInfo, numTads)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); + z[isPlainOutput ? 
r : shape::getIndexOffset(r, zShapeInfo)] = OpType::postProcess(sPartials[threadIdx.x], tadLength, extraParams); } } @@ -197,7 +197,7 @@ __device__ void ReduceSameFunction::execScalarCuda(void *vx, Nd4jLong *xShape sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[i * xEws], extraParams), extraParams); else for (int i = tid; i < len; i += blockDim.x * gridDim.x) - sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo, len)], extraParams), extraParams); + sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], extraParams), extraParams); __syncthreads(); aggregatePartials(sPartials, threadIdx.x, nd4j::math::nd4j_min(blockDim.x, len), extraParams); diff --git a/libnd4j/include/loops/cuda/reduce3.chpp b/libnd4j/include/loops/cuda/reduce3.chpp index 01b595da1..fa1ab2e17 100644 --- a/libnd4j/include/loops/cuda/reduce3.chpp +++ b/libnd4j/include/loops/cuda/reduce3.chpp @@ -161,8 +161,8 @@ __device__ void Reduce3::execScalarCuda( void *vx, Nd4jLong *xShapeInfo, sPartials[threadIdx.x] = OpType::startingValue(x); auto threadCount = gridDim.x * blockDim.x; for(Nd4jLong i = tid; i < length; i += threadCount) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto yOffset = shape::getIndexOffset(i, yShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto yOffset = shape::getIndexOffset(i, yShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::opAtomic(x[xOffset], y[yOffset], extraZ), extraZ); } } @@ -290,7 +290,7 @@ __device__ void Reduce3::transformAll( void *vx, Nd4jLong *xShapeInfo, X *x = dx + xOffsets[r]; if (threadIdx.x < xTadLength && threadIdx.x < maxBlock) { - auto x0 = shape::getIndexOffset(threadIdx.x, xTadShapeInfo, shape::length(xTadShapeInfo)); + auto x0 = shape::getIndexOffset(threadIdx.x, xTadShapeInfo); tempX[threadIdx.x] = x[x0]; } __syncthreads(); @@ -311,12 +311,12 @@ __device__ void Reduce3::transformAll( void *vx, Nd4jLong *xShapeInfo, // we reset tempX IF we have >1 tiles if (t >= 1 || (limit > 1 && g > 0)) if (threadIdx.x + (t * maxBlock) < xTadLength) { - auto x0 = shape::getIndexOffset(threadIdx.x + (t * maxBlock), xTadShapeInfo, xTadLength); + auto x0 = shape::getIndexOffset(threadIdx.x + (t * maxBlock), xTadShapeInfo); tempX[threadIdx.x] = x[x0]; } for (int f = threadIdx.x + (t * maxBlock); f < xTadLength && f < threadIdx.x + ((t + 1) * maxBlock); f += blockDim.x * gridDim.x) { - auto y0 = shape::getIndexOffset(f, yTadShapeInfo, yTadLength); + auto y0 = shape::getIndexOffset(f, yTadShapeInfo); sPartials[threadIdx.x] = OpType::update(sPartials[threadIdx.x], OpType::opAtomic(tempX[threadIdx.x], y[y0], extraZ), extraZ); } @@ -433,8 +433,8 @@ __device__ void Reduce3::transform(void *vx, Nd4jLong *xShapeInfo, for (int j = threadIdx.x; j < tadLen; j += blockDim.x) { - Nd4jLong xOffset2 = xOffset + shape::getIndexOffset(j, tadOnlyShapeInfo, tadLen); - Nd4jLong yOffset2 = yOffset + shape::getIndexOffset(j, yTadOnlyShapeInfo, tadLen); + Nd4jLong xOffset2 = xOffset + shape::getIndexOffset(j, tadOnlyShapeInfo); + Nd4jLong yOffset2 = yOffset + shape::getIndexOffset(j, yTadOnlyShapeInfo); sPartials[threadIdx.x] = j < blockDim.x ? 
OpType::opAtomic(x[xOffset2], y[yOffset2], extraZ) : OpType::update(sPartials[threadIdx.x], OpType::opAtomic(x[xOffset2], y[yOffset2], extraZ), extraZ); } diff --git a/libnd4j/include/loops/cuda/scalar.chpp b/libnd4j/include/loops/cuda/scalar.chpp index 503e5c97a..7277138f9 100644 --- a/libnd4j/include/loops/cuda/scalar.chpp +++ b/libnd4j/include/loops/cuda/scalar.chpp @@ -33,7 +33,7 @@ using namespace simdOps; //////////////////////////////////////////////////////////////////////////////// template __global__ static void scalarSimpleShaped(void* vx, void *vscalar, Nd4jLong *xShapeInfo, void *vparams, void *vz, Nd4jLong *zShapeInfo, int *allocationBuffer) { - + auto scalar = reinterpret_cast(vscalar)[0]; auto x = reinterpret_cast(vx); auto params = reinterpret_cast(vparams); @@ -61,10 +61,10 @@ __global__ static void scalarSimpleShaped(void* vx, void *vscalar, Nd4jLong *xSh } } else { for (Nd4jLong i = tid; i < length; i += totalThreads) { - z[shape::getIndexOffset(i, zShapeInfo, length)] = OpType::op(x[shape::getIndexOffset(i, xShapeInfo, length)], scalar, params); + z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(x[shape::getIndexOffset(i, xShapeInfo)], scalar, params); } } - + } //////////////////////////////////////////////////////////////////////////////// @@ -76,7 +76,7 @@ __global__ static void scalarAlongDimension(void *vx, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + auto x = reinterpret_cast(vx); auto extraParams = reinterpret_cast(vextraParams); auto z = reinterpret_cast(vz); @@ -114,7 +114,7 @@ __global__ static void scalarAlongDimension(void *vx, Nd4jLong *xShapeInfo, auto s = scalars[r]; for (int f = threadIdx.x; f < tadLength; f += blockDim.x) - oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams); + oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams); } } } @@ -127,7 +127,7 @@ namespace scalar { template template void _CUDA_H ScalarTransform::intermediateShaped(dim3& launchDims, cudaStream_t *stream, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, void* vscalar, void *vextraParams, int *allocPointer){ - + auto xEws = shape::elementWiseStride(hxShapeInfo); auto xOrder = shape::order(hxShapeInfo); diff --git a/libnd4j/include/loops/cuda/scalar_bool.cu b/libnd4j/include/loops/cuda/scalar_bool.cu index c6563c9ef..37939b9b9 100644 --- a/libnd4j/include/loops/cuda/scalar_bool.cu +++ b/libnd4j/include/loops/cuda/scalar_bool.cu @@ -36,7 +36,7 @@ __global__ void scalarAlongDimension(void *x, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::scalar::ScalarBoolTransform::template transformCuda(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); } @@ -60,10 +60,10 @@ namespace scalar { //////////////////////////////////////////////////////////////////////// template template -__device__ void ScalarBoolTransform::transformCuda(void* vscalar, - void *vy, Nd4jLong *yShapeInfo, - void *vparams, - void *vz, Nd4jLong *zShapeInfo, +__device__ void ScalarBoolTransform::transformCuda(void* vscalar, + void *vy, Nd4jLong *yShapeInfo, + void *vparams, + void *vz, Nd4jLong 
*zShapeInfo, int *allocationBuffer) { auto scalar = reinterpret_cast(vscalar)[0]; auto y = reinterpret_cast(vy); @@ -73,8 +73,8 @@ __device__ void ScalarBoolTransform::transformCuda(void* vscalar, auto yRank = shape::rank(yShapeInfo); auto yEWS = shape::elementWiseStride(yShapeInfo); auto yShape = shape::shapeOf(yShapeInfo); - auto yStride = shape::stride(yShapeInfo); - + auto yStride = shape::stride(yShapeInfo); + auto zRank = shape::rank(zShapeInfo); auto zEWS = shape::elementWiseStride(zShapeInfo); auto zShape = shape::shapeOf(zShapeInfo); @@ -89,22 +89,22 @@ __device__ void ScalarBoolTransform::transformCuda(void* vscalar, __syncthreads(); if(yEWS >= 1 && zEWS >= 1 && shape::order(yShapeInfo) == shape::order(zShapeInfo)) { - transformCuda(len, vscalar, vy, yEWS, vparams, vz, zEWS, allocationBuffer); + transformCuda(len, vscalar, vy, yEWS, vparams, vz, zEWS, allocationBuffer); } else { - for (Nd4jLong i = tid; i < len; i+= totalThreads) - z[shape::getIndexOffset(i, zShapeInfo, len)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, len)], scalar, params); + for (Nd4jLong i = tid; i < len; i+= totalThreads) + z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params); } } //////////////////////////////////////////////////////////////////////// template template -__device__ void ScalarBoolTransform::transformCuda(Nd4jLong len, - void* vx, - void *vy, Nd4jLong yEWS, - void *vparams, - void *vz, Nd4jLong zEWS, +__device__ void ScalarBoolTransform::transformCuda(Nd4jLong len, + void* vx, + void *vy, Nd4jLong yEWS, + void *vparams, + void *vz, Nd4jLong zEWS, int *allocationBuffer) { auto x = reinterpret_cast(vx)[0]; @@ -130,18 +130,18 @@ __device__ void ScalarBoolTransform::transformCuda(Nd4jLong len, //////////////////////////////////////////////////////////////////////// template template -__device__ void ScalarBoolTransform::transformCuda(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, +__device__ void ScalarBoolTransform::transformCuda(void *vx, Nd4jLong *xShapeInfo, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { auto x = reinterpret_cast(vx); auto scalars = reinterpret_cast(vscalars); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - + if (tadShapeInfoZ == nullptr) { tadShapeInfoZ = tadShapeInfo; tadOffsetsZ = tadOffsets; @@ -174,7 +174,7 @@ __device__ void ScalarBoolTransform::transformCuda(void *vx, Nd4jLong *xS auto s = scalars[r]; for (int f = threadIdx.x; f < tadLength; f += blockDim.x) - oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams); + oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams); } } } @@ -184,12 +184,12 @@ __device__ void ScalarBoolTransform::transformCuda(void *vx, Nd4jLong *xS template template _CUDA_H void ScalarBoolTransform::intermediateAlongDimension(dim3& launchDims, cudaStream_t *stream, - void *x, Nd4jLong *xShapeInfo, - void *z, Nd4jLong *zShapeInfo, - void *scalars, - void *extraParams, - int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, + void *x, Nd4jLong *xShapeInfo, + void *z, 
Nd4jLong *zShapeInfo, + void *scalars, + void *extraParams, + int *dimension, int dimensionLength, + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { scalarAlongDimension<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); @@ -200,11 +200,11 @@ _CUDA_H void ScalarBoolTransform::intermediateAlongDimension(dim3& launchD template template void _CUDA_H ScalarBoolTransform::intermediateShaped(dim3& launchDims, cudaStream_t *stream, - void *vx, Nd4jLong *xShapeInfo, - void *vz, Nd4jLong *zShapeInfo, - void* vscalar, + void *vx, Nd4jLong *xShapeInfo, + void *vz, Nd4jLong *zShapeInfo, + void* vscalar, void *vextraParams, int *allocPointer){ - + scalarSimpleShaped<<>>(vx, vscalar, xShapeInfo, vextraParams, vz, zShapeInfo, allocPointer); nd4j::DebugHelper::checkErrorCode(stream, "scalarSimpleShaped(...) failed"); } @@ -212,10 +212,10 @@ void _CUDA_H ScalarBoolTransform::intermediateShaped(dim3& launchDims, cuda //////////////////////////////////////////////////////////////////////// template void ScalarBoolTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, - int opNum, - void *vx, Nd4jLong *xShapeInfo, - void *vz, Nd4jLong *zShapeInfo, - void* vscalar, + int opNum, + void *vx, Nd4jLong *xShapeInfo, + void *vz, Nd4jLong *zShapeInfo, + void* vscalar, void *vextraParams) { if (nd4j::Environment::getInstance()->isDebugAndVerbose()) diff --git a/libnd4j/include/loops/cuda/scalar_int.cu b/libnd4j/include/loops/cuda/scalar_int.cu index 48f141525..44c73fcb4 100644 --- a/libnd4j/include/loops/cuda/scalar_int.cu +++ b/libnd4j/include/loops/cuda/scalar_int.cu @@ -36,7 +36,7 @@ __global__ void scalarAlongDimension(void *x, Nd4jLong *xShapeInfo, int *dimension, int dimensionLength, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { - + functions::scalar::ScalarIntTransform::template transformCuda(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); } @@ -60,10 +60,10 @@ namespace scalar { //////////////////////////////////////////////////////////////////////// template template -__device__ void ScalarIntTransform::transformCuda(void* vscalar, - void *vy, Nd4jLong *yShapeInfo, - void *vparams, - void *vz, Nd4jLong *zShapeInfo, +__device__ void ScalarIntTransform::transformCuda(void* vscalar, + void *vy, Nd4jLong *yShapeInfo, + void *vparams, + void *vz, Nd4jLong *zShapeInfo, int *allocationBuffer) { auto scalar = reinterpret_cast(vscalar)[0]; auto y = reinterpret_cast(vy); @@ -73,8 +73,8 @@ __device__ void ScalarIntTransform::transformCuda(void* vscalar, auto yRank = shape::rank(yShapeInfo); auto yEWS = shape::elementWiseStride(yShapeInfo); auto yShape = shape::shapeOf(yShapeInfo); - auto yStride = shape::stride(yShapeInfo); - + auto yStride = shape::stride(yShapeInfo); + auto zRank = shape::rank(zShapeInfo); auto zEWS = shape::elementWiseStride(zShapeInfo); auto zShape = shape::shapeOf(zShapeInfo); @@ -89,11 +89,11 @@ __device__ void ScalarIntTransform::transformCuda(void* vscalar, __syncthreads(); if(yEWS >= 1 && zEWS >= 1 && shape::order(yShapeInfo) == shape::order(zShapeInfo)) { - transformCuda(len, vscalar, vy, yEWS, vparams, vz, zEWS, allocationBuffer); + transformCuda(len, vscalar, vy, yEWS, vparams, vz, zEWS, allocationBuffer); } else { - for (Nd4jLong i = tid; i < len; i+= totalThreads) - z[shape::getIndexOffset(i, zShapeInfo, 
len)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo, len)], scalar, params); + for (Nd4jLong i = tid; i < len; i+= totalThreads) + z[shape::getIndexOffset(i, zShapeInfo)] = OpType::op(y[shape::getIndexOffset(i, yShapeInfo)], scalar, params); } } @@ -101,10 +101,10 @@ __device__ void ScalarIntTransform::transformCuda(void* vscalar, template template __device__ void ScalarIntTransform::transformCuda(Nd4jLong len, - void* vx, - void *vy, Nd4jLong yEWS, - void *vparams, - void *vz, Nd4jLong zEWS, + void* vx, + void *vy, Nd4jLong yEWS, + void *vparams, + void *vz, Nd4jLong zEWS, int *allocationBuffer) { auto x = reinterpret_cast(vx)[0]; @@ -131,17 +131,17 @@ __device__ void ScalarIntTransform::transformCuda(Nd4jLong len, template template __device__ void ScalarIntTransform::transformCuda(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - void *vscalars, - int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + void *vscalars, + int *dimension, int dimensionLength, + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { auto x = reinterpret_cast(vx); auto scalars = reinterpret_cast(vscalars); auto z = reinterpret_cast(vz); auto extraParams = reinterpret_cast(vextraParams); - + if (tadShapeInfoZ == nullptr) { tadShapeInfoZ = tadShapeInfo; tadOffsetsZ = tadOffsets; @@ -174,7 +174,7 @@ __device__ void ScalarIntTransform::transformCuda(void *vx, Nd4jLong *xShape auto s = scalars[r]; for (int f = threadIdx.x; f < tadLength; f += blockDim.x) - oZ[shape::getIndexOffset(f, tadShapeInfoZ, tadLength)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo, tadLength)], s, extraParams); + oZ[shape::getIndexOffset(f, tadShapeInfoZ)] = OpType::op(oX[shape::getIndexOffset(f, tadShapeInfo)], s, extraParams); } } } @@ -184,12 +184,12 @@ __device__ void ScalarIntTransform::transformCuda(void *vx, Nd4jLong *xShape template template _CUDA_H void ScalarIntTransform::intermediateAlongDimension(dim3& launchDims, cudaStream_t *stream, - void *x, Nd4jLong *xShapeInfo, - void *z, Nd4jLong *zShapeInfo, - void *scalars, - void *extraParams, - int *dimension, int dimensionLength, - Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, + void *x, Nd4jLong *xShapeInfo, + void *z, Nd4jLong *zShapeInfo, + void *scalars, + void *extraParams, + int *dimension, int dimensionLength, + Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, Nd4jLong *tadShapeInfoZ, Nd4jLong *tadOffsetsZ) { scalarAlongDimension<<>>(x, xShapeInfo, extraParams, z, zShapeInfo, scalars, dimension, dimensionLength, tadShapeInfo, tadOffsets, tadShapeInfoZ, tadOffsetsZ); @@ -199,21 +199,21 @@ _CUDA_H void ScalarIntTransform::intermediateAlongDimension(dim3& launchDims, template template void _CUDA_H ScalarIntTransform::intermediateShaped(dim3& launchDims, cudaStream_t *stream, - void *vx, Nd4jLong *xShapeInfo, - void *vz, Nd4jLong *zShapeInfo, - void* vscalar, + void *vx, Nd4jLong *xShapeInfo, + void *vz, Nd4jLong *zShapeInfo, + void* vscalar, void *vextraParams, int *allocPointer){ - + scalarSimpleShaped<<>>(vx, vscalar, xShapeInfo, vextraParams, vz, zShapeInfo, allocPointer); } //////////////////////////////////////////////////////////////////////// template void ScalarIntTransform::executeCudaShaped(dim3& launchDims, cudaStream_t *stream, - int opNum, - void *vx, Nd4jLong *xShapeInfo, - void *vz, Nd4jLong *zShapeInfo, - void* vscalar, + int opNum, + void *vx, Nd4jLong *xShapeInfo, + void *vz, Nd4jLong 
*zShapeInfo, + void* vscalar, void *vextraParams) { if (nd4j::Environment::getInstance()->isDebugAndVerbose()) diff --git a/libnd4j/include/loops/cuda/specials/bitonicArbitraryStep.cu b/libnd4j/include/loops/cuda/specials/bitonicArbitraryStep.cu index 8ee950c25..13ad1d5b4 100644 --- a/libnd4j/include/loops/cuda/specials/bitonicArbitraryStep.cu +++ b/libnd4j/include/loops/cuda/specials/bitonicArbitraryStep.cu @@ -80,8 +80,8 @@ __global__ void bitonicArbitraryStepKernelKey(void *vx, Nd4jLong *xShapeInfo, vo int it = (reverse) ? i + j + half : i + window - j - 1; int ij = i+j; if (it < length && ij < length ) { - int posIT = shape::getIndexOffset(it, xShapeInfo, xLength); - int posIJ = shape::getIndexOffset(ij, xShapeInfo, xLength); + int posIT = shape::getIndexOffset(it, xShapeInfo); + int posIJ = shape::getIndexOffset(ij, xShapeInfo); X v0 = x[posIJ]; X v1 = x[posIT]; @@ -160,8 +160,8 @@ __global__ void execBitonicArbitraryStepKernel(void *vx, Nd4jLong *xShapeInfo, i int it = (reverse) ? i + j + half : i + window - j - 1; int ij = i+j; if (it < length && ij < length ) { - int posIT = shape::getIndexOffset(it, xShapeInfo, xLength); - int posIJ = shape::getIndexOffset(ij, xShapeInfo, xLength); + int posIT = shape::getIndexOffset(it, xShapeInfo); + int posIJ = shape::getIndexOffset(ij, xShapeInfo); shmem[threadIdx.x] = x[posIJ]; shmem[threadIdx.x + blockDim.x] = x[posIT]; diff --git a/libnd4j/include/loops/cuda/specials/bitonicSortStep.cu b/libnd4j/include/loops/cuda/specials/bitonicSortStep.cu index d9b2ec74c..6bd1e8a33 100644 --- a/libnd4j/include/loops/cuda/specials/bitonicSortStep.cu +++ b/libnd4j/include/loops/cuda/specials/bitonicSortStep.cu @@ -46,8 +46,8 @@ __global__ void bitonicSortStepKernelKey(void *vx, Nd4jLong *xShapeInfo, void *v /* The threads with the lowest ids sort the array. */ if ((ixj)>i) { - int posI = shape::getIndexOffset(i, xShapeInfo, xLength); - int posIXJ = shape::getIndexOffset(ixj, xShapeInfo, xLength); + int posI = shape::getIndexOffset(i, xShapeInfo); + int posIXJ = shape::getIndexOffset(ixj, xShapeInfo); if ((i&k)==0) { /* Sort ascending */ @@ -100,8 +100,8 @@ __global__ void bitonicSortStepKernel(void *vx, Nd4jLong *xShapeInfo, int j, int /* The threads with the lowest ids sort the array. 
*/ if ((ixj)>i) { - int posI = shape::getIndexOffset(i, xShapeInfo, xLength); - int posIXJ = shape::getIndexOffset(ixj, xShapeInfo, xLength); + int posI = shape::getIndexOffset(i, xShapeInfo); + int posIXJ = shape::getIndexOffset(ixj, xShapeInfo); if ((i&k)==0) { /* Sort ascending */ diff --git a/libnd4j/include/loops/cuda/specials/concatKernel.cu b/libnd4j/include/loops/cuda/specials/concatKernel.cu index 5d788c4d1..b6ba2f00e 100644 --- a/libnd4j/include/loops/cuda/specials/concatKernel.cu +++ b/libnd4j/include/loops/cuda/specials/concatKernel.cu @@ -139,19 +139,19 @@ namespace nd4j { Nd4jLong sub[MAX_RANK]; - shape::index2coords(shape::rank(zTadShape),shape::shapeOf(zTadShape), arrOffset, sub, shape::order(zTadShape)); - - Nd4jLong baseOffset = shape::getOffset(0,shape::shapeOf(zTadShape),shape::stride(zTadShape), sub, shape::rank(zTadShape)); + shape::index2coords(arrOffset, zTadShape, sub); + + Nd4jLong baseOffset = shape::getOffset(zTadShape, sub); resultTAD += baseOffset; auto yRank = shape::rank(currentTad); auto tadRank = shape::rank(zTadShape); - shape::index2coords(yRank, shape::shapeOf(currentTad), 0, sub); + shape::index2coords(0, currentTad, sub); - auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), sub, yRank); - resultOffset = shape::getOffset(0, shape::shapeOf(zTadShape), shape::stride(zTadShape), sub, tadRank); + auto yOffset = shape::getOffset(currentTad, sub); + resultOffset = shape::getOffset(zTadShape, sub); resultTAD[resultOffset] = dataTAD[yOffset]; } @@ -168,8 +168,8 @@ namespace nd4j { Nd4jLong sub[MAX_RANK]; - shape::index2coords(shape::rank(zTadShape),shape::shapeOf(zTadShape), arrOffset, sub); - Nd4jLong baseOffset = shape::getOffset(0,shape::shapeOf(zTadShape),shape::stride(zTadShape), sub, shape::rank(zTadShape)); + shape::index2coords(arrOffset, zTadShape, sub); + Nd4jLong baseOffset = shape::getOffset(zTadShape, sub); resultTAD += baseOffset; @@ -203,8 +203,8 @@ namespace nd4j { auto yRank = shape::rank(currentTad); for (int i = threadIdx.x; i < yLength; i+= blockDim.x) { - shape::index2coords(yRank, shape::shapeOf(currentTad), i, yIdx); - auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), yIdx, yRank); + shape::index2coords(i, currentTad, yIdx); + auto yOffset = shape::getOffset(currentTad, yIdx); resultTAD[baseIdx + i * tadEWS] = dataTAD[yOffset]; } @@ -220,11 +220,11 @@ namespace nd4j { auto tadRank = shape::rank(zTadShape); for (int i = threadIdx.x; i < yLength; i+= blockDim.x) { - shape::index2coords(yRank, shape::shapeOf(currentTad), i, yIdx); - shape::index2coords(tadRank, shape::shapeOf(zTadShape), i, zIdx); + shape::index2coords(i, currentTad, yIdx); + shape::index2coords(i, zTadShape, zIdx); - auto yOffset = shape::getOffset(0, shape::shapeOf(currentTad), shape::stride(currentTad), yIdx, yRank); - auto resultOffset = shape::getOffset(0, shape::shapeOf(zTadShape), shape::stride(zTadShape), zIdx, tadRank); + auto yOffset = shape::getOffset(currentTad, yIdx); + auto resultOffset = shape::getOffset(zTadShape, zIdx); resultTAD[resultOffset] = dataTAD[yOffset]; } diff --git a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu index 70da24715..e39ff6bec 100644 --- a/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu +++ b/libnd4j/include/loops/cuda/specials/fillDimensionalIsMax.cu @@ -53,7 +53,7 @@ namespace nd4j { if (dimensionLength > 1 || tadEWS < 1) { for (Nd4jLong e = threadIdx.x; e < tadLength; 
e += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(e, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(e, tadOnlyShapeInfo); dZ[xOffset] = (e == highestElement ? (T) 1 : (T) 0); } } else { diff --git a/libnd4j/include/loops/cuda/specials/fillIsMax.cu b/libnd4j/include/loops/cuda/specials/fillIsMax.cu index 0851968ba..c9ed51d28 100644 --- a/libnd4j/include/loops/cuda/specials/fillIsMax.cu +++ b/libnd4j/include/loops/cuda/specials/fillIsMax.cu @@ -30,7 +30,7 @@ namespace nd4j { int tid = blockIdx.x * blockDim.x + threadIdx.x; for (Nd4jLong i = tid; i < length; i += blockDim.x * gridDim.x) - dz[shape::getIndexOffset(i, xShapeInfo, length)] = (i == idx ? (T) 1 : (T) 0); + dz[shape::getIndexOffset(i, xShapeInfo)] = (i == idx ? (T) 1 : (T) 0); } //////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/loops/cuda/specials/flatten.cu b/libnd4j/include/loops/cuda/specials/flatten.cu index b820acae9..faec2ec90 100644 --- a/libnd4j/include/loops/cuda/specials/flatten.cu +++ b/libnd4j/include/loops/cuda/specials/flatten.cu @@ -20,6 +20,7 @@ // #include +#include namespace nd4j { @@ -34,34 +35,26 @@ __global__ void flattenKernel( auto z = reinterpret_cast(vz); auto y = reinterpret_cast(vy); - + __shared__ Nd4jLong lenY, yOrder, zEWS, yEWS; - if (threadIdx.x == 0) { - + if (threadIdx.x == 0) { + yEWS = shape::elementWiseStride(yShapeInfo); zEWS = shape::elementWiseStride(zShapeInfo); lenY = shape::length(yShapeInfo); } __syncthreads(); - Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; - - if (zEWS >= 1 && yEWS >= 1 && yOrder == order) { - - for (int i = tid; i < lenY; i += gridDim.x * blockDim.x) - z[i * zEWS + dOffset] = y[i * yEWS]; - } - else { - - for(auto i = tid; i < lenY; i += gridDim.x * blockDim.x) - z[i * zEWS + dOffset] = y[shape::getIndexOrderOffset(i, yShapeInfo, lenY, order)]; - } + Nd4jLong tid = blockIdx.x * blockDim.x + threadIdx.x; + + for(auto i = tid; i < lenY; i += gridDim.x * blockDim.x) + z[i * zEWS + dOffset] = y[ops::helpers::getIndexOffsetOrdered(i, yShapeInfo, order)]; } //////////////////////////////////////////////////////////////////////// template -__host__ void flattenKernelGeneric(dim3& launchDims, cudaStream_t *stream, +__host__ void flattenKernelGeneric(dim3& launchDims, cudaStream_t *stream, Nd4jPointer *extraPointers, int dOffset, char order, diff --git a/libnd4j/include/loops/cuda/specials/oesTad.cu b/libnd4j/include/loops/cuda/specials/oesTad.cu index 8cc6c0766..9f41ffbb9 100644 --- a/libnd4j/include/loops/cuda/specials/oesTad.cu +++ b/libnd4j/include/loops/cuda/specials/oesTad.cu @@ -54,8 +54,8 @@ __global__ void execOesTadKernelKey(void *vx, Nd4jLong *xShapeInfo, for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { auto top = 2 * tid + 1; if (top < xTadLength) { - auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength); - auto t1 = shape::getIndexOffset(top, tadShapeInfo, xTadLength); + auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo); + auto t1 = shape::getIndexOffset(top, tadShapeInfo); if (!descending == (dx[t0] > dx[t1])) { X dt0 = dx[t0]; @@ -72,8 +72,8 @@ __global__ void execOesTadKernelKey(void *vx, Nd4jLong *xShapeInfo, for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { auto top = 2 * tid + 2; if (top < xTadLength) { - auto t0 = shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength); - auto t1 = shape::getIndexOffset(top, tadShapeInfo, xTadLength); + auto t0 = shape::getIndexOffset(top - 1, 
tadShapeInfo); + auto t1 = shape::getIndexOffset(top, tadShapeInfo); if (!descending == (dx[t0] > dx[t1])) { X dt0 = dx[t0]; @@ -126,7 +126,7 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo, int iterations = xTadLength; if (cached) { for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { - auto t0 = shape::getIndexOffset(tid, tadShapeInfo, xTadLength); + auto t0 = shape::getIndexOffset(tid, tadShapeInfo); shmem[tid] = dx[t0]; } @@ -140,8 +140,8 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo, for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { auto top = 2 * tid + 1; if (top < xTadLength) { - auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength); - auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo, xTadLength); + auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo); + auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo); if (!descending == (dx[t0] > dx[t1])) { T dt0 = dx[t0]; @@ -154,8 +154,8 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo, for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { auto top = 2 * tid + 2; if (top < xTadLength) { - auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo, xTadLength); - auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo, xTadLength); + auto t0 = cached ? top - 1 : shape::getIndexOffset(top - 1, tadShapeInfo); + auto t1 = cached ? top : shape::getIndexOffset(top, tadShapeInfo); if (!descending == (dx[t0] > dx[t1])) { T dt0 = dx[t0]; @@ -172,7 +172,7 @@ __global__ void execOesTadKernel(void *vx, Nd4jLong *xShapeInfo, if (cached) { dx = x + tadOffsets[r]; for (int tid = threadIdx.x; tid < xTadLength; tid += blockDim.x) { - auto t0 = shape::getIndexOffset(tid, tadShapeInfo, xTadLength); + auto t0 = shape::getIndexOffset(tid, tadShapeInfo); dx[t0] = shmem[tid]; } } diff --git a/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu b/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu index 39db62099..9730565e6 100644 --- a/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu +++ b/libnd4j/include/loops/cuda/specials/pullRowsKernel.cu @@ -53,8 +53,8 @@ namespace nd4j { T *rZ = z + zTadOffsets[idx]; for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = shape::getIndexOffset(i, tadShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(i, zTadShapeInfo, tadLength); + auto xOffset = shape::getIndexOffset(i, tadShapeInfo); + auto zOffset = shape::getIndexOffset(i, zTadShapeInfo); rZ[zOffset] = rX[xOffset]; } } diff --git a/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu b/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu index adea72bc4..9bf79bedf 100644 --- a/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu +++ b/libnd4j/include/loops/cuda/specials/setDiagonalKernel.cu @@ -33,7 +33,7 @@ namespace nd4j { for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) { for (int j = threadIdx.x; j < cols; j += totalThreads) { Nd4jLong coords[2] = {i, j}; - Nd4jLong xOffset = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), coords, rank); + Nd4jLong xOffset = shape::getOffset(shape, coords); if (i + diagonal <= j) array[xOffset] = value; } @@ -48,7 +48,7 @@ namespace nd4j { for (Nd4jLong i = blockIdx.x; i < rows; i += gridDim.x) { for (int j = threadIdx.x; j < cols; j += totalThreads) { Nd4jLong coords[2] = {i, j}; - auto xOffset = shape::getOffset(0, shape::shapeOf(shape), 
shape::stride(shape), coords, rank); + auto xOffset = shape::getOffset(shape, coords); if (i + diagonal >= j) *(reinterpret_cast(buffer) + xOffset) = value; } diff --git a/libnd4j/include/loops/cuda/specials/shuffleKernel.cu b/libnd4j/include/loops/cuda/specials/shuffleKernel.cu index 6e7d4ad3b..c842cad4a 100644 --- a/libnd4j/include/loops/cuda/specials/shuffleKernel.cu +++ b/libnd4j/include/loops/cuda/specials/shuffleKernel.cu @@ -92,7 +92,7 @@ namespace nd4j { } else { for (Nd4jLong i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo[f], tadLength); + auto xOffset = shape::getIndexOffset(i, tadOnlyShapeInfo[f]); auto yOffset = newOffset + xOffset; xOffset += oldOffset; diff --git a/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu b/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu index bf49d788e..fd36eec00 100644 --- a/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu +++ b/libnd4j/include/loops/cuda/specials/swapUnsafeKernel.cu @@ -34,8 +34,8 @@ namespace nd4j { auto xEws = shape::order(theFirstShape) == 'c'? shape::elementWiseStride(theFirstShape) :1; auto yEws = shape::order(theSecondShape) == 'c'? shape::elementWiseStride(theSecondShape):1; //if (shape::order(theFirstShape) ==) - auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape, resultLength); - auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape, resultLength); + auto xOffset = shape::getIndexOffset(i * xEws, theFirstShape); + auto yOffset = shape::getIndexOffset(i * yEws, theSecondShape); T temp = *(reinterpret_cast(theFirstBuffer) + xOffset); *(reinterpret_cast(theFirstBuffer) + xOffset) = *(reinterpret_cast(theSecondBuffer) + yOffset); *(reinterpret_cast(theSecondBuffer) + yOffset) = temp; diff --git a/libnd4j/include/loops/cuda/specials/tearKernel.cu b/libnd4j/include/loops/cuda/specials/tearKernel.cu index 9f78f14da..e12aa485f 100644 --- a/libnd4j/include/loops/cuda/specials/tearKernel.cu +++ b/libnd4j/include/loops/cuda/specials/tearKernel.cu @@ -61,8 +61,8 @@ namespace nd4j { } else { for (Nd4jLong j = threadIdx.x; j < tadLength; j += blockDim.x) { - auto xOffset = shape::getIndexOffset(j, tadShapeInfo, tadLength); - auto zOffset = shape::getIndexOffset(j, zShapeInfo, tadLength); + auto xOffset = shape::getIndexOffset(j, tadShapeInfo); + auto zOffset = shape::getIndexOffset(j, zShapeInfo); z[zOffset] = s[xOffset]; } diff --git a/libnd4j/include/loops/cuda/specials/tileKernel.cu b/libnd4j/include/loops/cuda/specials/tileKernel.cu index 7d2e87e2d..0a62313f0 100644 --- a/libnd4j/include/loops/cuda/specials/tileKernel.cu +++ b/libnd4j/include/loops/cuda/specials/tileKernel.cu @@ -21,8 +21,8 @@ #include namespace nd4j { - static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo, Nd4jLong length) { - return shape::getIndexOffset(index, shapeInfo, length); + static Nd4jLong __device__ __noinline__ _getIndexOffset(Nd4jLong index, Nd4jLong *shapeInfo) { + return shape::getIndexOffset(index, shapeInfo); } static Nd4jLong __device__ __noinline__ _subArrayOffset(Nd4jLong index, Nd4jLong *shapeInfoA, Nd4jLong *shapeInfoB) { @@ -50,7 +50,7 @@ namespace nd4j { } } else { for (int i = tid; i < resultLength; i += totalThreads) { - auto xOffset = _getIndexOffset(i, outputShape, resultLength); + auto xOffset = _getIndexOffset(i, outputShape); auto yOffset = _subArrayOffset(i, outputShape, inputShape); *(reinterpret_cast(outputBuffer) + xOffset) = *(reinterpret_cast(inputBuffer) + yOffset); } @@ -89,7 +89,7 @@ 
namespace nd4j { for (int i = tid; i < resultLength; i += totalThreads) { - auto xOffset = _getIndexOffset(i, outputShape, resultLength); + auto xOffset = _getIndexOffset(i, outputShape); auto yOffset = _subArrayOffset(i, outputShape, inputShape); *(reinterpret_cast(outputBuffer) + xOffset) = static_cast(*(reinterpret_cast(inputBuffer) + yOffset)); } diff --git a/libnd4j/include/loops/cuda/summarystatsreduce.cu b/libnd4j/include/loops/cuda/summarystatsreduce.cu index deca80217..4867f5de1 100644 --- a/libnd4j/include/loops/cuda/summarystatsreduce.cu +++ b/libnd4j/include/loops/cuda/summarystatsreduce.cu @@ -40,7 +40,7 @@ namespace functions { template void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRank, void *extraParams, void *z, Nd4jLong *zShapeInfo, int zRank, int *dimension, int dimensionLength, int postProcessOrNot,bool biasCorrected,int *allocationBuffer, void *reductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { - + functions::summarystats::SummaryStatsReduce::transform(op,dx,xShapeInfo,extraParams,z,zShapeInfo,dimension,dimensionLength,biasCorrected,allocationBuffer,reductionBuffer,tadOnlyShapeInfo,tadOffsets); } @@ -103,12 +103,12 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa */ template template - _CUDA_D void SummaryStatsReduce::transform(void *vx, Nd4jLong *xShapeInfo, - void *vextraParams, - void *vz, Nd4jLong *zShapeInfo, - int *dimension, int dimensionLength, - int postProcessOrNot, - int *allocationBuffer, void *vreductionBuffer, + _CUDA_D void SummaryStatsReduce::transform(void *vx, Nd4jLong *xShapeInfo, + void *vextraParams, + void *vz, Nd4jLong *zShapeInfo, + int *dimension, int dimensionLength, + int postProcessOrNot, + int *allocationBuffer, void *vreductionBuffer, Nd4jLong *tadOnlyShapeInfo, Nd4jLong *tadOffsets) { auto dx = static_cast(vx); @@ -204,7 +204,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa sPartials[threadIdx.x] = val; for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); SummaryStatsData indexVal2; indexVal2.initWithValue(dx[xOffset]); @@ -264,8 +264,8 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa else { for (Nd4jLong i = tid; i < n; i += blockDim.x * gridDim.x) { - - auto offset = shape::getIndexOffset(i, xShapeInfo, n); + + auto offset = shape::getIndexOffset(i, xShapeInfo); SummaryStatsData indexVal2; indexVal2.initWithValue(dx[offset]); reduction = update(reduction, indexVal2, extraParams); @@ -279,7 +279,7 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa if (gridDim.x > 1) { __shared__ bool amLast; - unsigned int *tc = (unsigned int *)reductionBuffer; + unsigned int *tc = (unsigned int *)reductionBuffer; tid = threadIdx.x; if (threadIdx.x == 0) { SummaryStatsData *pBuffer = (SummaryStatsData*) reductionBuffer; @@ -338,9 +338,9 @@ void _CUDA_G summaryStatsReduceT(int op, void *dx, Nd4jLong *xShapeInfo, int xRa template _CUDA_H void SummaryStatsReduce::execSummaryStatsReduceScalar(dim3& launchDims, cudaStream_t *stream, int opNum, void *vx, Nd4jLong *xShapeInfo, Nd4jLong *hxShapeInfo, void *vextraParams, void *vz, Nd4jLong *zShapeInfo, Nd4jLong *hzShapeInfo, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets, bool biasCorrected, void *reductionBuffer) { - + auto x = static_cast(vx); - auto 
extraParams = static_cast(vextraParams); + auto extraParams = static_cast(vextraParams); auto z = reinterpret_cast(vz); auto reductionPointerA = reinterpret_cast(reductionBuffer); diff --git a/libnd4j/include/loops/cuda/transform/transform_any.cu b/libnd4j/include/loops/cuda/transform/transform_any.cu index 34f56380a..18b53cea7 100644 --- a/libnd4j/include/loops/cuda/transform/transform_any.cu +++ b/libnd4j/include/loops/cuda/transform/transform_any.cu @@ -36,7 +36,7 @@ __global__ void transformAnySimple(void *x, Nd4jLong *xShapeInfo, int xRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + functions::transform::TransformAny::template transformCuda(x,xShapeInfo,params,z,zShapeInfo,allocationPointer,reductionPointer,tadShapeInfo, tadOffsets); } @@ -57,7 +57,7 @@ namespace functions { __device__ void TransformAny::transformCuda(void *vx, Nd4jLong *xShapeInfo, void *vparams, void *vz, Nd4jLong *zShapeInfo, - int *allocationPointer, void *vreductionPointer, + int *allocationPointer, void *vreductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto x = reinterpret_cast(vx); @@ -70,9 +70,9 @@ namespace functions { __shared__ char xOrder; __shared__ char zOrder; __shared__ Nd4jLong length; - + if (threadIdx.x == 0) { - + xEws = shape::elementWiseStride(xShapeInfo); zEws = shape::elementWiseStride(zShapeInfo); xOrder = shape::order(xShapeInfo); @@ -84,26 +84,26 @@ namespace functions { auto tid = blockIdx.x * blockDim.x + threadIdx.x; int totalThreads = gridDim.x * blockDim.x; - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - + if(xEws > 0 && zEws > 0 && xOrder == zOrder) { + for (int i = tid; i < length; i += totalThreads) - z[i * zEws] = OpType::op(x[i * xEws], params); + z[i * zEws] = OpType::op(x[i * xEws], params); } - else { + else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); - } + } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], params); } } - } + } }; diff --git a/libnd4j/include/loops/cuda/transform/transform_bool.cu b/libnd4j/include/loops/cuda/transform/transform_bool.cu index 52e6b4a10..e88a4274b 100644 --- a/libnd4j/include/loops/cuda/transform/transform_bool.cu +++ b/libnd4j/include/loops/cuda/transform/transform_bool.cu @@ -68,16 +68,16 @@ namespace functions { if(OpType::requiresSpecial) { OpType::execSpecialCuda(x,xShapeInfo,z,zShapeInfo,params, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); return; - } + } else { __shared__ Nd4jLong xEws; __shared__ Nd4jLong zEws; __shared__ char xOrder; __shared__ char zOrder; __shared__ Nd4jLong length; - + if (threadIdx.x == 0) { - + xEws = shape::elementWiseStride(xShapeInfo); zEws = shape::elementWiseStride(zShapeInfo); xOrder = shape::order(xShapeInfo); @@ -87,28 +87,28 @@ namespace functions { __syncthreads(); auto tid = blockIdx.x * blockDim.x + threadIdx.x; - int totalThreads = gridDim.x * blockDim.x; + int totalThreads = gridDim.x * blockDim.x; + + if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - for (int i = tid; i < length; i += 
totalThreads) - z[i * zEws] = OpType::op(x[i * xEws], params); + z[i * zEws] = OpType::op(x[i * xEws], params); } - else { + else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); - } + } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); - z[zOffset] = OpType::op(x[xOffset], params); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); + z[zOffset] = OpType::op(x[xOffset], params); } } - } + } } }; diff --git a/libnd4j/include/loops/cuda/transform/transform_float.cu b/libnd4j/include/loops/cuda/transform/transform_float.cu index 6fe7b18d1..44ddb0246 100644 --- a/libnd4j/include/loops/cuda/transform/transform_float.cu +++ b/libnd4j/include/loops/cuda/transform/transform_float.cu @@ -35,7 +35,7 @@ __global__ void transformFloatSimple(void *x, Nd4jLong *xShapeInfo, int xRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + functions::transform::TransformFloat::template transformCuda( x, xShapeInfo, params, @@ -64,7 +64,7 @@ namespace functions { void *vparams, void *vz, Nd4jLong *zShapeInfo, - int *allocationPointer, void *vreductionPointer, + int *allocationPointer, void *vreductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto x = reinterpret_cast(vx); @@ -75,7 +75,7 @@ namespace functions { if(OpType::requiresSpecial) { OpType::execSpecialCuda(x,xShapeInfo,z,zShapeInfo,params, allocationPointer, reductionPointer, tadShapeInfo, tadOffsets); return; - } + } else { __shared__ Nd4jLong xEws; @@ -83,9 +83,9 @@ namespace functions { __shared__ char xOrder; __shared__ char zOrder; __shared__ Nd4jLong length; - + if (threadIdx.x == 0) { - + xEws = shape::elementWiseStride(xShapeInfo); zEws = shape::elementWiseStride(zShapeInfo); xOrder = shape::order(xShapeInfo); @@ -95,24 +95,24 @@ namespace functions { __syncthreads(); auto tid = blockIdx.x * blockDim.x + threadIdx.x; - int totalThreads = gridDim.x * blockDim.x; + int totalThreads = gridDim.x * blockDim.x; + + if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - if(xEws > 0 && zEws > 0 && xOrder == zOrder) { - for (Nd4jLong i = tid; i < length; i += totalThreads) z[i * zEws] = OpType::op(x[i * xEws], params); } - else { + else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); - } + } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], params); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_same.cu b/libnd4j/include/loops/cuda/transform/transform_same.cu index 6c533ac3a..e59381fba 100644 --- a/libnd4j/include/loops/cuda/transform/transform_same.cu +++ b/libnd4j/include/loops/cuda/transform/transform_same.cu @@ -95,14 +95,14 @@ namespace functions { else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto 
xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], params); } } diff --git a/libnd4j/include/loops/cuda/transform/transform_strict.cu b/libnd4j/include/loops/cuda/transform/transform_strict.cu index a0989b0e6..0befdf35f 100644 --- a/libnd4j/include/loops/cuda/transform/transform_strict.cu +++ b/libnd4j/include/loops/cuda/transform/transform_strict.cu @@ -35,7 +35,7 @@ __global__ void transformStrictSimple(void *x, Nd4jLong *xShapeInfo, int xRank, int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { - + functions::transform::TransformStrict::template transformCuda(x,xShapeInfo,params,z,zShapeInfo,allocationPointer,reductionPointer,tadShapeInfo, tadOffsets); } @@ -97,14 +97,14 @@ namespace functions { else { if(vx == vz) { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); z[xOffset] = OpType::op(x[xOffset], params); } } else { for (Nd4jLong i = tid; i < length; i+= totalThreads) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = OpType::op(x[xOffset], params); } } diff --git a/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp b/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp index 6370579d2..d507a5b8d 100644 --- a/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/convo/conv3d.cpp @@ -24,6 +24,7 @@ #include #include +#include #include namespace nd4j { @@ -162,7 +163,8 @@ CUSTOM_OP_IMPL(conv3dnew, 2, 1, false, 0, 13) { MmulHelper::tensorDot(&columns, weights, output, {1,2,3,4}, {3,0,1,2}, permutForOutput); if(bias) - output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCDHW); if(!isNCDHW) delete input; diff --git a/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp b/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp index 3a7450fc7..f69b6c0f9 100644 --- a/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp +++ b/libnd4j/include/ops/declarable/generic/convo/deconv2d.cpp @@ -27,7 +27,7 @@ #include #include #include - +#include namespace nd4j { namespace ops { @@ -80,7 +80,8 @@ CUSTOM_OP_IMPL(deconv2d, 2, 1, false, 0, 9) { //----- add biases if required -----// if(bias) - output->applyBroadcast(broadcast::Add, {1}, bias); + // output->applyBroadcast(broadcast::Add, {1}, bias); + helpers::addBias(block, *output, *bias, *output, true); if(!isNCHW) delete output; diff --git a/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp b/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp index 20d0e991e..f875e4693 100644 --- a/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp +++ b/libnd4j/include/ops/declarable/generic/convo/deconv3d.cpp @@ -23,6 +23,7 @@ #include #include +#include 
#include namespace nd4j { @@ -79,7 +80,8 @@ CUSTOM_OP_IMPL(deconv3d, 2, 1, false, 0, 13) { //----- add biases if required -----// if(bias) - output->applyBroadcast(broadcast::Add,{1}, bias); + // output->applyBroadcast(broadcast::Add,{1}, bias); + helpers::addBias(block, *output, *bias, *output, true); if(!isNCDHW) delete output; diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp index 3309c6104..f5a65079a 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/bias_add.cpp @@ -15,107 +15,111 @@ ******************************************************************************/ // -// @author raver119@gmail.com +// @author raver119@gmail.com +// @author Yurii Shyrma (iuriish@yahoo.com) // #include #if NOT_EXCLUDED(OP_biasadd) #include +#include namespace nd4j { - namespace ops { - DECLARE_TYPES(biasadd) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes({ALL_FLOATS}); - } +namespace ops { - CUSTOM_OP_IMPL(biasadd, 2, 1, true, 0, 0) { - //REQUIRE_OK(this->validateInput2D(block)); - auto input = INPUT_VARIABLE(0); - auto bias = INPUT_VARIABLE(1); +//////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(biasadd, 2, 1, true, 0, 0) { - REQUIRE_TRUE(bias->isRowVector(), 0, "Bias array should be a vector"); + auto input = INPUT_VARIABLE(0); + auto bias = INPUT_VARIABLE(1); - auto z = OUTPUT_VARIABLE(0); + auto output = OUTPUT_VARIABLE(0); - if (input->isMatrix()) - input->addRowVector(bias, z); - else { - // TODO: we might want to use NDArray::applyTrueBroadcast here, like AddOp does - std::vector shape({-1, bias->lengthOf()}); - //nd4j_debug("Reshaping to: [%i, %i]\n", -1, (int) bias->lengthOf()); - auto tArr = input->reshape(input->ordering(), shape); - auto zArr = z->reshape(z->ordering(), shape); - tArr.addRowVector(bias, &zArr); - } + const bool isNCHW = !block.getBArguments()->empty() ? B_ARG(0) : false; + const int channelDim = isNCHW ? 
1 : input->rankOf() - 1; // second or last - STORE_RESULT(*z); + REQUIRE_TRUE(bias->rankOf() == 1, 0, "BIASADD CUSTOM_OP: bias array should have rank = 1, but got %i instead !", bias->rankOf()); - return Status::OK(); - } - DECLARE_SYN(bias_add, biasadd); + REQUIRE_TRUE(bias->sizeAt(0) == input->sizeAt(channelDim), 0, "BIASADD CUSTOM_OP: shapes of bias %s and input %s arrays are not suitable for broadcast operation along channel dimension %i !", ShapeUtils::shapeAsString(bias).c_str(), ShapeUtils::shapeAsString(input).c_str(), channelDim); - DECLARE_SHAPE_FN(biasadd) { - auto xShape = inputShape->at(0); - auto yShape = inputShape->at(1); + REQUIRE_TRUE(output->isSameShape(input), 0, "BIASADD CUSTOM_OP: wrong shape of output array, expected is %s but got %s instead !", ShapeUtils::shapeAsString(input).c_str(), ShapeUtils::shapeAsString(output).c_str()); - auto dtype = ArrayOptions::dataType(yShape); - return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(xShape, dtype))); - } + helpers::addBias(block, *input, *bias, *output, isNCHW); + // input->applyBroadcast(nd4j::broadcast::Add, {channelDim}, bias, output); - DECLARE_TYPES(biasadd_bp) { - getOpDescriptor() - ->setAllowedInputTypes(nd4j::DataType::ANY) - ->setAllowedOutputTypes({ALL_FLOATS}); - } + return Status::OK(); +} +DECLARE_SYN(bias_add, biasadd); - CUSTOM_OP_IMPL(biasadd_bp, 3, 2, false, 0, 0) { - auto input = INPUT_VARIABLE(0); - auto bias = INPUT_VARIABLE(1); - auto epsilonNext = INPUT_VARIABLE(2); +//////////////////////////////////////////////////////////////////// +DECLARE_SHAPE_FN(biasadd) { + auto xShape = inputShape->at(0); + auto yShape = inputShape->at(1); - auto epsilon = OUTPUT_VARIABLE(0); - auto gradB = OUTPUT_VARIABLE(1); + auto dtype = ArrayOptions::dataType(yShape); + return SHAPELIST(ConstantShapeHelper::getInstance()->createShapeInfo(ShapeDescriptor(xShape, dtype))); +} - epsilon->assign(epsilonNext); +DECLARE_TYPES(biasadd) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedOutputTypes({ALL_FLOATS}); +} - // cnn case - if (input->rankOf() == 4) { - auto epsilonNext2d = epsilonNext->permute({1, 0, 2, 3}); - epsilonNext2d.reshapei('c', {(int) bias->lengthOf(), -1}); +//////////////////////////////////////////////////////////////////// +CUSTOM_OP_IMPL(biasadd_bp, 3, 2, false, 0, 0) { + auto input = INPUT_VARIABLE(0); + auto bias = INPUT_VARIABLE(1); + auto epsilonNext = INPUT_VARIABLE(2); - auto sum = epsilonNext2d.reduceAlongDimension(reduce::Sum, {1}); - gradB->assign(sum); + auto epsilon = OUTPUT_VARIABLE(0); + auto gradB = OUTPUT_VARIABLE(1); - delete sum; - } else if (input->rankOf() == 2) { - // regular fully-connected case - auto sum = epsilonNext->reduceAlongDimension(reduce::Sum, {0}); - gradB->assign(sum); - - delete sum; - } + epsilon->assign(epsilonNext); - return ND4J_STATUS_OK; - } - DECLARE_SYN(BiasAddGrad, biasadd_bp); + // cnn case + if (input->rankOf() == 4) { + auto epsilonNext2d = epsilonNext->permute({1, 0, 2, 3}); + epsilonNext2d.reshapei('c', {(int) bias->lengthOf(), -1}); - DECLARE_SHAPE_FN(biasadd_bp) { - auto input = inputShape->at(0); - auto bias = inputShape->at(1); + auto sum = epsilonNext2d.reduceAlongDimension(reduce::Sum, {1}); + gradB->assign(sum); - Nd4jLong* epsShape; - Nd4jLong* gradShape; + delete sum; + } else if (input->rankOf() == 2) { + // regular fully-connected case + auto sum = epsilonNext->reduceAlongDimension(reduce::Sum, {0}); + gradB->assign(sum); - COPY_SHAPE(input, epsShape); - COPY_SHAPE(bias, 
gradShape); - - return SHAPELIST(CONSTANT(epsShape), CONSTANT(gradShape)); - - } + delete sum; } + + return ND4J_STATUS_OK; +} +DECLARE_SYN(BiasAddGrad, biasadd_bp); + +DECLARE_SHAPE_FN(biasadd_bp) { + auto input = inputShape->at(0); + auto bias = inputShape->at(1); + + Nd4jLong* epsShape; + Nd4jLong* gradShape; + + COPY_SHAPE(input, epsShape); + COPY_SHAPE(bias, gradShape); + + return SHAPELIST(CONSTANT(epsShape), CONSTANT(gradShape)); +} + +DECLARE_TYPES(biasadd_bp) { + getOpDescriptor() + ->setAllowedInputTypes(nd4j::DataType::ANY) + ->setAllowedOutputTypes({ALL_FLOATS}); +} + + +} } #endif \ No newline at end of file diff --git a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp index c430fd4d2..c3e73da84 100644 --- a/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp +++ b/libnd4j/include/ops/declarable/generic/parity_ops/matrix_diag.cpp @@ -43,14 +43,15 @@ DECLARE_SHAPE_FN(matrix_diag) { auto in = inputShape->at(0); int inRank = shape::rank(in); + // if for example diagonal array has shape [A,B,C] then output array has shape [A,B,C,C] + int outRank = inRank + 1; - auto lastDimension = shape::sizeAt(in, -1); ALLOCATE(outShapeInfo, block.getWorkspace(), shape::shapeInfoLength(outRank), Nd4jLong); outShapeInfo[0] = outRank; for(int i = 0; i < inRank; ++i) outShapeInfo[i + 1] = shape::sizeAt(in, i); - outShapeInfo[outRank] = lastDimension; + outShapeInfo[outRank] = shape::sizeAt(in, -1); ShapeUtils::updateStridesAndType(outShapeInfo, in, shape::order(in)); diff --git a/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp b/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp index 8ab5fa32f..06656b9de 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/layer_norm.cpp @@ -23,7 +23,7 @@ #include #include - +#include namespace nd4j { namespace ops { @@ -59,7 +59,8 @@ namespace ops { output->applyBroadcast(nd4j::broadcast::Multiply, {dimC}, gain); if(bias != nullptr) { // output->applyTrueBroadcast(nd4j::BroadcastOpsTuple::Add(), bias, output); - output->applyBroadcast(nd4j::broadcast::Add, {dimC}, bias); + // output->applyBroadcast(nd4j::broadcast::Add, {dimC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCHW); } return Status::OK(); diff --git a/libnd4j/include/ops/declarable/headers/parity_ops.h b/libnd4j/include/ops/declarable/headers/parity_ops.h index bb7f306bd..e30ff86a5 100644 --- a/libnd4j/include/ops/declarable/headers/parity_ops.h +++ b/libnd4j/include/ops/declarable/headers/parity_ops.h @@ -79,36 +79,44 @@ namespace nd4j { * Inserts elements provided by diagonal array into the main diagonal of innermost matrices of input array * * Input arrays: - * input: input array, considered as batch of matrices - * diagonal: array containing elements to be inserted into input array, - * following rank condition should be satisfied: diagonal_rank = input_rank - 1, - * the shapes of diagonal and input arrays must be equal except last dimension of input array, - * for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C], - * also last dimension of diagonal array should be equal to smaller of last and last but one input dimensions - * that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2]) + * 0: input array, considered as batch of matrices + * 1: diagonal array containing elements to be inserted into input array, + * following rank condition should be 
satisfied: diagonal_rank = input_rank - 1, + * the shapes of diagonal and input arrays must be equal except last dimension of input array, + * for example if input_shape = [A,B,C,D] then diagonal_shape = [A,B,C], + * also last dimension of diagonal array should be equal to smaller of last and last but one input dimensions + * that is: diagonal_shape[-1] = min(input_shape[-1], input_shape[-2]) * * Output array: - * has the same shape as input, corresponding diagonal elements are substituted + * 0: has the same shape as input, corresponding diagonal elements are substituted */ #if NOT_EXCLUDED(OP_matrix_set_diag) DECLARE_CONFIGURABLE_OP(matrix_set_diag, 2, 1, false, 0, 0); #endif /** - * Returns a batched matrix tensor with diagonal values given (as TF.matrix_diag). - */ + * Inserts elements provided by diagonal array into the main diagonal of innermost matrices of output array, + * the remaining output elements are set to zero + * + * Input array: + * diagonal: array containing elements to be inserted into output array, + * following rank condition must be satisfied: diagonal_rank = output_rank - 1 + * + * Output array: + * 0: is considered as batch of matrices, if for example diagonal array has shape [A,B,C] then output array has shape [A,B,C,C] + */ DECLARE_CUSTOM_OP(matrix_diag, 1, 1, false, 0, 0); /** * This op calculates regularized incomplete beta integral Ix(a, b). * Implementation is based on two algorithms depending on input values of a and b: - * - when a and b are both > maxValue (3000.), then apply Gauss-Legendre quadrature method - * - when a and b are both <= maxValue (3000.), then apply modified Lentz’s algorithm for continued fractions + * - when a and b are both > maxValue (3000.), then Gauss-Legendre quadrature method is applied + * - when a and b are both <= maxValue (3000.), then modified Lentz’s algorithm for continued fractions is applied * * Input arrays: - * a: define power t^{a-1}, must be > 0, type float. - * b: define power (1-t)^{b-1}, must be > 0, type float. - * x: define upper limit of integration, must be within (0 <= x <= 1) range, type float. + * a: defines power t^{a-1}, must be > 0, type float. + * b: defines power (1-t)^{b-1}, must be > 0, type float. + * x: defines upper limit of integration, must be within (0 <= x <= 1) range, type float
* * Output array: * 0: values of regularized incomplete beta integral that corresponds to variable upper limit x, type float diff --git a/libnd4j/include/ops/declarable/helpers/addBias.h b/libnd4j/include/ops/declarable/helpers/addBias.h index 3d9fdec88..c754c07de 100644 --- a/libnd4j/include/ops/declarable/helpers/addBias.h +++ b/libnd4j/include/ops/declarable/helpers/addBias.h @@ -22,14 +22,15 @@ #define LIBND4J_ADDBIAS_H #include +#include namespace nd4j { namespace ops { namespace helpers { - void addBias(NDArray& input, const NDArray& bias, const bool isNCHW); - + void addBias(graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW); + } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp index fbc395fc6..bd29094ec 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/activations.cpp @@ -91,19 +91,19 @@ static void softMaxForVector_(void *input, Nd4jLong *inShapeInfo, void *output, PRAGMA_OMP_SIMD_ARGS(reduction(OMP_MAXT:max)) for (int i = 0; i < length; i++) { - const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); + const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); max = nd4j::math::nd4j_max(max, inBuff[offset]); } PRAGMA_OMP_PARALLEL_FOR_SIMD_ARGS(reduction(OMP_SUMT:sum)) for (int i = 0; i < length; i++) { - const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); + const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] = nd4j::math::nd4j_exp(inBuff[offset] - max); sum += outBuff[offset]; } PRAGMA_OMP_SIMD for (int i = 0; i < length; i++) { - const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo, length); + const Nd4jLong offset = shape::getIndexOffset(i, inShapeInfo); outBuff[offset] /= sum; outBuff[offset] *= (1.f - outBuff[offset]); // derivative } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp index 09cdf5d4e..0e6e1f777 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/addBias.cpp @@ -28,70 +28,116 @@ namespace helpers { ////////////////////////////////////////////////////////////////////////// template -static void addBias_(NDArray& input, const NDArray& bias, const bool isNCHW) { +static void addBias_(const NDArray& input, const NDArray& bias, NDArray &output, const bool isNCHW) { - // input [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW) - // bias [oC] + // bias [oC] - X* inBuff = input.bufferAsT(); - const Y* biasBuff = bias.bufferAsT(); + // if(input_rank == 4) + // input and output have same shapes: [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW) + // if(input_rank == 5) + // input and output have same shapes: [bS, oD, oH, oW, oC] (NDHWC) or [bS, oC, oD, oH, oW] (NCDHW) + // else + // apply applyBroadcast - int bS, iC, iH, iW, oC, oH, oW; // batch size, input channels, input height/width, output channels, output height/width; - bS = input.sizeAt(0); - const Nd4jLong stride0 = input.stridesOf()[0]; - const Nd4jLong stride1 = input.stridesOf()[1]; - const Nd4jLong stride2 = input.stridesOf()[2]; - uint biasShapeInfoCast[MAX_RANK]; - bool canCastBias = nd4j::DataTypeUtils::castShapeInfo(bias.getShapeInfo(), biasShapeInfoCast); - - if(isNCHW) { - - oC = input.sizeAt(1); - oH = input.sizeAt(2); - oW = input.sizeAt(3); + const X* x = input.bufferAsT(); + const Y* y =
bias.bufferAsT(); + X* z = output.bufferAsT(); - const int oHoW = oH*oW; + const bool inOutAreSame = x == z; - PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) - for (int i = 0; i < bS; ++i) { - for (int c = 0; c < oC; ++c) { - - auto biasOffset = shape::indexOffset(c, bias.getShapeInfo(), biasShapeInfoCast, oC, canCastBias); - auto inOffset = i * stride0 + c * stride1; + const uint bS = output.sizeAt(0); // batch size + const Nd4jLong yStrideC = bias.stridesOf()[0]; + const Nd4jLong zStrideB = output.stridesOf()[0]; - PRAGMA_OMP_SIMD - for (uint k = 0; k < oHoW; ++k) - inBuff[inOffset + k] += static_cast(biasBuff[biasOffset]); - } + if(output.rankOf() == 4) { + + const uint C = isNCHW ? output.sizeAt(1) : output.sizeAt(3); // channels + const uint oH = isNCHW ? output.sizeAt(2) : output.sizeAt(1); // height + const uint oW = isNCHW ? output.sizeAt(3) : output.sizeAt(2); // width + + const Nd4jLong zStrideC = isNCHW ? output.stridesOf()[1] : output.stridesOf()[3]; + const Nd4jLong zStrideH = isNCHW ? output.stridesOf()[2] : output.stridesOf()[1]; + const Nd4jLong zStrideW = isNCHW ? output.stridesOf()[3] : output.stridesOf()[2]; + + if(inOutAreSame) { + + PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) + for(uint b = 0; b < bS; ++b) + for(uint c = 0; c < C; ++c) + for(uint h = 0; h < oH ; ++h) + for(uint w = 0; w < oW ; ++w) + z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + } + else { + + const Nd4jLong xStrideB = input.stridesOf()[0]; + const Nd4jLong xStrideC = isNCHW ? input.stridesOf()[1] : input.stridesOf()[3]; + const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; + const Nd4jLong xStrideW = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2]; + + PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(4)) + for(uint b = 0; b < bS; ++b) + for(uint c = 0; c < C; ++c) + for(uint h = 0; h < oH ; ++h) + for(uint w = 0; w < oW ; ++w) + z[b*zStrideB + c*zStrideC + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); + } + } + else if(output.rankOf() == 5) { + + const uint C = isNCHW ? output.sizeAt(1) : output.sizeAt(4); // channels + const uint oD = isNCHW ? output.sizeAt(2) : output.sizeAt(1); // depth + const uint oH = isNCHW ? output.sizeAt(3) : output.sizeAt(2); // height + const uint oW = isNCHW ? output.sizeAt(4) : output.sizeAt(3); // width + + const Nd4jLong zStrideC = isNCHW ? output.stridesOf()[1] : output.stridesOf()[4]; + const Nd4jLong zStrideD = isNCHW ? output.stridesOf()[2] : output.stridesOf()[1]; + const Nd4jLong zStrideH = isNCHW ? output.stridesOf()[3] : output.stridesOf()[2]; + const Nd4jLong zStrideW = isNCHW ? output.stridesOf()[4] : output.stridesOf()[3]; + + if(inOutAreSame) { + + PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) + for(uint b = 0; b < bS; ++b) + for(uint c = 0; c < C; ++c) + for(uint d = 0; d < oD ; ++d) + for(uint h = 0; h < oH ; ++h) + for(uint w = 0; w < oW ; ++w) + z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] += static_cast(y[c*yStrideC]); + } + else { + + const Nd4jLong xStrideB = input.stridesOf()[0]; + const Nd4jLong xStrideC = isNCHW ? input.stridesOf()[1] : input.stridesOf()[4]; + const Nd4jLong xStrideD = isNCHW ? input.stridesOf()[2] : input.stridesOf()[1]; + const Nd4jLong xStrideH = isNCHW ? input.stridesOf()[3] : input.stridesOf()[2]; + const Nd4jLong xStrideW = isNCHW ? 
input.stridesOf()[4] : input.stridesOf()[3]; + + PRAGMA_OMP_PARALLEL_FOR_ARGS(collapse(5)) + for(uint b = 0; b < bS; ++b) + for(uint c = 0; c < C; ++c) + for(uint d = 0; d < oD ; ++d) + for(uint h = 0; h < oH ; ++h) + for(uint w = 0; w < oW ; ++w) + z[b*zStrideB + c*zStrideC + d*zStrideD + h*zStrideH + w*zStrideW] = x[b*xStrideB + c*xStrideC + d*xStrideD + h*xStrideH + w*xStrideW] + static_cast(y[c*yStrideC]); } } else { - - oC = input.sizeAt(3); - oH = input.sizeAt(1); - oW = input.sizeAt(2); - - PRAGMA_OMP_PARALLEL_FOR - for (int i = 0; i < bS*oH*oW; ++i) { - - PRAGMA_OMP_SIMD - for (int c = 0; c < oC; ++c) { - auto biasOffset = shape::indexOffset(c, bias.getShapeInfo(), biasShapeInfoCast, oC, canCastBias); - inBuff[i * oC + c] += static_cast(biasBuff[biasOffset]); - } - } - } + const int channelDim = isNCHW ? 1 : input.rankOf() - 1; // second or last + const_cast(input).applyBroadcast(nd4j::broadcast::Add, {channelDim}, &bias, &output); + } } ////////////////////////////////////////////////////////////////////////// -void addBias(NDArray& input, const NDArray& bias, const bool isNCHW) { +void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) { - BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, isNCHW), FLOAT_TYPES, FLOAT_TYPES); + // bias.rankOf() == 1 ? bias : bias.reshape(bias.ordering(), {bias.lengthOf()}) + BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBias_, (input, bias, output, isNCHW), FLOAT_TYPES, FLOAT_TYPES); } -BUILD_DOUBLE_TEMPLATE(template void addBias_, (NDArray& input, const NDArray& bias, const bool isNCHW), FLOAT_TYPES, FLOAT_TYPES); +BUILD_DOUBLE_TEMPLATE(template void addBias_, (const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW), FLOAT_TYPES, FLOAT_TYPES); } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp index 0c12a2896..d6c4da4a1 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/batchnorm.cpp @@ -84,7 +84,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* const Nd4jLong end = start + step; // calculate offset for mean, variance, gamma, beta (all of them have the same shape) - auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, lenSmall, canCastMean); + auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, canCastMean); // calculate offset for input and output (all of them have the same shape) shape::outerArrayOffsets(inOffsets, j, inShapeInfo, meanShapeInfo, memBuff, dimsToExclude.data()); @@ -114,7 +114,7 @@ static void batchnorm_(const NDArray* input, const NDArray* mean, const NDArray* const Nd4jLong end = start + step; // calculate offset for mean, variance, gamma, beta (all of them have the same shape) - auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, lenSmall, canCastMean); + auto offsetSmall = shape::indexOffset(j, meanShapeInfo, meanShapeInfoCast, canCastMean); // calculate offset for input and output (all of them have the same shape) shape::outerArrayOffsets(inOffsets, j, inShapeInfo, meanShapeInfo, memBuff, dimsToExclude.data()); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp index 681b4eb63..bba3e8acb 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp +++ 
b/libnd4j/include/ops/declarable/helpers/cpu/betaInc.cpp @@ -29,7 +29,7 @@ namespace helpers { /////////////////////////////////////////////////////////////////// // modified Lentz’s algorithm for continued fractions, -// reference: Lentz, W.J. 1976, “Generating Bessel Functions in Mie Scattering Calculations Using Continued Fractions,” +// reference: Lentz, W.J. 1976, “Generating Bessel Functions in Mie Scattering Calculations Using Continued Fractions” template static T continuedFraction(const T a, const T b, const T x) { @@ -122,9 +122,8 @@ static void betaIncForArray(nd4j::LaunchContext * context, const NDArray& a, con int xLen = x.lengthOf(); PRAGMA_OMP_PARALLEL_FOR_IF(xLen > Environment::getInstance()->elementwiseThreshold()) - for(int i = 0; i < xLen; ++i) { - output.p(i, betaIncCore(a.e(i), b.e(i), x.e(i))); - } + for(int i = 0; i < xLen; ++i) + output.t(i) = betaIncCore(a.t(i), b.t(i), x.t(i)); } /////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp index 3d04bc129..f096d5bfa 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/convolutions.cpp @@ -648,7 +648,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( //----- add biases if required -----// if(bias) // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); - helpers::addBias(*output, *bias, isNCHW); + helpers::addBias(block, *output, *bias, *output, isNCHW); if(!isNCHW) delete input; @@ -875,7 +875,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( ////////////////////////////////////////////////////////////////////////// template - static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { + static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, mC] always @@ -922,7 +922,8 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC] if(bias) - output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCHW); if(!isNCHW) delete input; @@ -2451,7 +2452,7 @@ void ConvolutionUtils::getMKLDNNMemoryDescConv3d( BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), conv2dBP_, (block, input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); } void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { - BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, 
isNCHW), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); } void ConvolutionUtils::depthwiseConv2dBP(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, const NDArray* gradO, NDArray* gradI, NDArray* gradW, NDArray* gradB, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2dBP_, (input, weights, bias, gradO, gradI, gradW, gradB, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); diff --git a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp index 6a08064fc..30d4d3f7a 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/flatten.cpp @@ -37,24 +37,16 @@ namespace nd4j { cOffset += inputs[e]->lengthOf(); } - Nd4jLong xCoord[MAX_RANK]; - // actually transferring data for (int e = 0; e < numArrays; e++) { auto z = reinterpret_cast(output->bufferWithOffset(offsets[e])); auto xBuffer = inputs[e]->bufferAsT(); auto xShapeInfo = inputs[e]->shapeInfo(); - auto xShape = shape::shapeOf(xShapeInfo); - auto xStride = shape::stride(xShapeInfo); - auto xRank = shape::rank(xShapeInfo); auto xLength = inputs[e]->lengthOf(); - - for (uint i = 0; i < xLength; i++) { - shape::index2coords(xRank, xShape, i, xLength, xCoord, order); - auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank); - z[i] = xBuffer[xOffset]; - } + + for (uint i = 0; i < xLength; i++) + z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp index af4e96e2e..def210457 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/ismax.cpp @@ -30,7 +30,7 @@ namespace helpers { template static void ismax_(const NDArray* input, NDArray* output, const std::vector& dimensions) { - + if (input->isVector()) { int dimensionsLength = dimensions.size(); int length = input->lengthOf(); @@ -169,7 +169,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector for (int i = 0; i < tadLength; i++) { rZ[i] = maxIdx == i ? (Z) 1 : (Z) 0; } - } + } else if (tadEWS > 1 && zEWS > 1) { for (int i = 0; i < tadLength; i++) { if (rX[i * tadEWS] > maxValue) { @@ -184,7 +184,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector } } else { for (int i = 0; i < tadLength; i++) { - auto xOffset = shape::getIndexOffset(i, tadShapeShapeInfo, tadLength); + auto xOffset = shape::getIndexOffset(i, tadShapeShapeInfo); if (rX[xOffset] > maxValue) { maxIdx = i; maxValue = rX[xOffset]; @@ -193,7 +193,7 @@ static void ismax_(const NDArray* input, NDArray* output, const std::vector PRAGMA_OMP_SIMD for (int i = 0; i < tadLength; i++) { - auto zOffset = shape::getIndexOffset(i, tadPackZ.primaryShapeInfo(), tadLength); + auto zOffset = shape::getIndexOffset(i, tadPackZ.primaryShapeInfo()); rZ[zOffset] = maxIdx == i ? 
(Z) 1 : (Z) 0; } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp index e974755ac..9a2034fd0 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/matrixSetDiag.cpp @@ -52,14 +52,14 @@ void matrixSetDiag_(const NDArray& input, const NDArray& diagonal, NDArray& outp PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) for (Nd4jLong i = 0; i < xLen; ++i) { - shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords.data()); + shape::index2coords(i, xShapeInfo, coords.data()); - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords.data(), xRank); - const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords.data(), xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coords.data()); + const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(zShapeInfo, coords.data()); // condition to be on diagonal of innermost matrix if(coords[xRank - 2] == coords[xRank - 1]) - z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords.data(), xRank - 1)]; + z[zOffset] = y[shape::getOffset(yShapeInfo, coords.data())]; else z[zOffset] = zeroPad ? static_cast(0) : x[xOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp index ea273d33b..a83518899 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/one_hot.cpp @@ -73,12 +73,12 @@ namespace nd4j { if (idx < 0 || idx >= tLen) { PRAGMA_OMP_SIMD for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo(), tLen)] = zero; + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = zero; } } else { PRAGMA_OMP_SIMD for (unsigned int t = 0; t < tLen; t++) { - cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo(), tLen)] = idx == t ? one : zero; + cO[shape::getIndexOffset(t, tadPack.primaryShapeInfo())] = idx == t ? one : zero; } } } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp index bd14fbd8d..f46346876 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/prefix.cpp @@ -53,8 +53,8 @@ namespace nd4j { for (Nd4jLong e = length - 1; e >= 0; --e) { - auto xOffset = shape::getIndexOffset(e, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(e, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(e, xShapeInfo); + auto zOffset = shape::getIndexOffset(e, zShapeInfo); sum = op == scalar::Add ? simdOps::Add::op(sum, x[xOffset]) : simdOps::Multiply::op(sum, x[xOffset]); if (!exclusive) @@ -83,8 +83,8 @@ namespace nd4j { for (int e = 0; e < length; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(e, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(e, xShapeInfo); + auto zOffset = shape::getIndexOffset(e, zShapeInfo); sum = op == scalar::Add ? 
simdOps::Add::op(sum, x[xOffset]) : simdOps::Multiply::op(sum, x[xOffset]); if (!exclusive) diff --git a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp index 0922a1248..83deeca88 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/reverse.cpp @@ -60,7 +60,7 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * // inArr[e] = inArr[idx]; // inArr[idx] = tmp; } - } + } else if (inEWS > 1) { PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { @@ -71,19 +71,19 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * // inArr[idx1] = tmp; swap(inArr, idx1, idx2); } - } + } else { PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = 0; e < numOfElemsToReverse / 2; e++) { - - auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength); - auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer, inLength); + + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, inShapeBuffer); //outArr[outOffset] = inArr[inOffset]; swap(outArr, inOffset, outOffset); } } - } + } else { // single step phase here auto outEWS = shape::elementWiseStride(outShapeBuffer); @@ -92,15 +92,15 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * if (inEWS == 1 && outEWS == 1 && inOrder == outOrder) { PRAGMA_OMP_PARALLEL_FOR - for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) - outArr[sLength - e] = inArr[e]; + for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) + outArr[sLength - e] = inArr[e]; if(inLength != numOfElemsToReverse) { PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) outArr[e] = inArr[e]; } - } + } else if (inEWS >= 1 && outEWS >= 1 && inOrder == outOrder) { PRAGMA_OMP_PARALLEL_FOR @@ -112,14 +112,14 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) outArr[e * outEWS] = inArr[e * inEWS]; } - } + } else { PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = 0; e < numOfElemsToReverse; e++) { - auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength); - auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer, outLength); + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(sLength - e, outShapeBuffer); outArr[outOffset] = inArr[inOffset]; } @@ -128,9 +128,9 @@ static void reverseArray(nd4j::LaunchContext * context, void *vinArr, Nd4jLong * PRAGMA_OMP_PARALLEL_FOR for (Nd4jLong e = numOfElemsToReverse; e < inLength; e++) { - auto inOffset = shape::getIndexOffset(e, inShapeBuffer, inLength); - auto outOffset = shape::getIndexOffset(e, outShapeBuffer, outLength); - outArr[outOffset] = inArr[inOffset]; + auto inOffset = shape::getIndexOffset(e, inShapeBuffer); + auto outOffset = shape::getIndexOffset(e, outShapeBuffer); + outArr[outOffset] = inArr[inOffset]; } } } @@ -151,7 +151,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input helpers::reverseArray(context, const_cast(input)->getBuffer(), const_cast(input)->getShapeInfo(), output->getBuffer(), output->getShapeInfo(), seqLengths->e(0)); } else { - + if(seqDim > batchDim) --seqDim; @@ -163,7 +163,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input for(int i = 0; i < inSubArrsSet->size(); ++i) { Nd4jLong 
numOfElemsToReverse = seqLengths->e(i); - + if(numOfElemsToReverse == 0 || numOfElemsToReverse == 1) { outSubArrsSet->at(i)->assign(inSubArrsSet->at(i)); } @@ -172,7 +172,7 @@ static void _reverseSequence(nd4j::LaunchContext * context, const NDArray* input auto outInnerSet = outSubArrsSet->at(i)->allTensorsAlongDimension({seqDim}); for(int j = 0; j < inInnerSet->size(); ++j) helpers::reverseArray(context, inInnerSet->at(j)->getBuffer(), inInnerSet->at(j)->getShapeInfo(), outInnerSet->at(j)->getBuffer(), outInnerSet->at(j)->getShapeInfo(), numOfElemsToReverse); - + delete inInnerSet; delete outInnerSet; } @@ -195,12 +195,12 @@ void reverse(nd4j::LaunchContext * context, const NDArray* input, NDArray* outpu auto listOut = output->allTensorsAlongDimension(dimensions); auto listIn = input->allTensorsAlongDimension(dimensions); - + NDArray *subArrIn, *subArrOut; for(int i = 0; i < listIn->size(); ++i) { // listIn->size() = listOut->size() subArrIn = listIn->at(i); - subArrOut = listOut->at(i); + subArrOut = listOut->at(i); BUILD_SINGLE_SELECTOR(input->dataType(), helpers::reverseArray, (context, subArrIn->getBuffer(), subArrIn->getShapeInfo(), subArrOut->getBuffer(), subArrOut->getShapeInfo()), LIBND4J_TYPES); } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp index cc97e3c5b..5b4c44874 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/s_t_b.cpp @@ -116,15 +116,15 @@ static void batchToSpaceND_(const NDArray& input, const NDArray& crop, NDArray& for (Nd4jLong i = 0; i < zLen; ++i) { - shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data()); + shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); // evaluate spatial coordinates for x for(uint j = 1; j <= numOfSpatialDims; ++j) coords[j] += crop.e(j - 1, 0); // add crop left - z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)]; + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; } } @@ -298,9 +298,9 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(guided) firstprivate(coords)) for (Nd4jLong i = 0; i < zLen; ++i) { - shape::index2coords(rank, output.shapeOf(), i, zLen, coords.data()); + shape::index2coords(i, output.getShapeInfo(), coords.data()); - const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), coords.data(), rank); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); bool within = true; @@ -318,7 +318,7 @@ static void spaceToBatchND_(const NDArray& input, const NDArray& padding, NDArra } if(within) - z[zOffset] = x[shape::getOffset(0, input.shapeOf(), input.stridesOf(), coords.data(), rank)]; + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; else z[zOffset] = 0.f; } diff --git a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp index 71181afe8..9e04ed4df 100644 --- a/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp +++ b/libnd4j/include/ops/declarable/helpers/cpu/transforms.cpp @@ -178,8 +178,6 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray const Nd4jLong* xShape = input.shapeOf(); const 
Nd4jLong* zShape = output.shapeOf(); - const Nd4jLong* xStride = input.stridesOf(); - const Nd4jLong* zStride = output.stridesOf(); const int rank = input.rankOf(); // both input and output have the same rank const int rankMinusOne = rank - 1; @@ -195,8 +193,8 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) for(uint i = 0; i < zLen; ++i) { - shape::index2coords(rank, zShape, i, zLen, coords.data()); - const auto zOffset = shape::getOffset(0, zShape, zStride, coords.data(), rank); + shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); bool within = true; for(int j = rankMinusOne; j >= 0; --j) { @@ -207,7 +205,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray } if(within) - z[zOffset] = x[shape::getOffset(0, xShape, xStride, coords.data(), rank)]; + z[zOffset] = x[shape::getOffset(input.getShapeInfo(), coords.data())]; else z[zOffset] = padVal; } @@ -220,8 +218,8 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(coords)) for(uint i = 0; i < zLen; ++i) { - shape::index2coords(rank, zShape, i, zLen, coords.data()); - const auto zOffset = shape::getOffset(0, zShape, zStride, coords.data(), rank); + shape::index2coords(i, output.getShapeInfo(), coords.data()); + const auto zOffset = shape::getOffset(output.getShapeInfo(), coords.data()); for(int j = rankMinusOne; j >= 0; --j) { @@ -231,7 +229,7 @@ void pad_(const int mode, const NDArray& input, const NDArray& paddings, NDArray else if(coords[j] >= xShape[j]) coords[j] = 2 * xShape[j] - coords[j] - shift2; // means fill from right } - const auto xOffset = shape::getOffset(0, xShape, xStride, coords.data(), rank); + const auto xOffset = shape::getOffset(input.getShapeInfo(), coords.data()); z[zOffset] = x[xOffset]; } } @@ -580,9 +578,9 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { xCoordStart = coords.data(); } - shape::index2coords(zRank, output.shapeOf(), i, zLen, zCoordStart); + shape::index2coords(i, output.getShapeInfo(), zCoordStart); - const auto zOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), zCoordStart, zRank); + const auto zOffset = shape::getOffset(output.getShapeInfo(), zCoordStart); // last y coordinate uint coordToRestore; @@ -590,7 +588,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { coordToRestore = static_cast(zCoordStart[yRank - 1]); zCoordStart[yRank - 1] = 0; - const auto yOffset = shape::getOffset(0, indices.shapeOf(), indices.stridesOf(), zCoordStart, yRank); + const auto yOffset = shape::getOffset(indices.getShapeInfo(), zCoordStart); //restore z coordinate if(yLastDim != xRank) @@ -600,7 +598,7 @@ static void gatherND_(NDArray& input, NDArray& indices, NDArray& output) { for(uint j = 0; j < yLastDim; ++j) xCoordStart[j] = y[yOffset + j * indices.stridesOf()[yRank - 1]]; // last stride - const auto xOffset = shape::getOffset(0, input.shapeOf(), input.stridesOf(), xCoordStart, xRank); + const auto xOffset = shape::getOffset(input.getShapeInfo(), xCoordStart); z[zOffset] = x[xOffset]; } @@ -1172,7 +1170,7 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o PRAGMA_OMP_PARALLEL_FOR_ARGS(firstprivate(inIdx, outIdx)) for(int i = 0; i < outLen; ++i) { - shape::index2coords(rank, output.shapeOf(), i, outIdx.data()); + 
shape::index2coords(i, output.getShapeInfo(), outIdx.data()); for(int j = 0; j < rank; ++j) { @@ -1191,8 +1189,8 @@ static void mirrorPad_(const NDArray& input, const NDArray& paddings, NDArray& o inIdx[j] = len - outIdx[j]; } - auto outOffset = shape::getOffset(0, output.shapeOf(), output.stridesOf(), outIdx.data(), rank); - auto inOffset = shape::getOffset(0, input.shapeOf(), input.stridesOf(), inIdx.data(), rank); + auto outOffset = shape::getOffset(output.getShapeInfo(), outIdx.data()); + auto inOffset = shape::getOffset(input.getShapeInfo(), inIdx.data()); reinterpret_cast(output.buffer())[outOffset] = reinterpret_cast(input.getBuffer())[inOffset]; } } @@ -1259,7 +1257,7 @@ static void tileBP_(const NDArray& gradO /*input*/, NDArray& gradI /*output*/, c for(Nd4jLong i=0; i(fidx) + gradOBuff[shape::getIndexOffset(i, gradO.getShapeInfo(), gradOLen)]); + gradI.p(fidx, gradI.e(fidx) + gradOBuff[shape::getIndexOffset(i, gradO.getShapeInfo())]); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu index f402944aa..21b2eecd4 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/activations.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/activations.cu @@ -60,9 +60,9 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo, for (int i = tid; i < xzLen; i += totalThreads) { - shape::index2coords(xzRank, xShapeInfo + 1, i, xzLen, coords); + shape::index2coords(i, xShapeInfo, coords); - const auto xzOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xzRank + 1, coords, xzRank); + const auto xzOffset = shape::getOffset(xShapeInfo, coords); const auto xVal = x[xzOffset]; @@ -72,7 +72,7 @@ __global__ void preluCuda(const void *vx, const Nd4jLong *xShapeInfo, if(yShapeInfo[j + 1] == 1) coords[j + 1] = 0; - z[xzOffset] = xVal * y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + yRank + 1, coords + 1, yRank)]; + z[xzOffset] = xVal * y[shape::getOffset(yShapeInfo, coords + 1)]; } else z[xzOffset] = xVal; @@ -139,11 +139,11 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI for (int i = tid; i < inLen; i += totalThreads) { - shape::index2coords(inRank, inShapeInfo + 1, i, inLen, coords); + shape::index2coords(i, inShapeInfo, coords); - const auto inOffset = shape::getOffset(0, inShapeInfo + 1, inShapeInfo + inRank + 1, coords, inRank); - const auto dLdOOffset = shape::getOffset(0, dLdOShapeInfo + 1, dLdOShapeInfo + inRank + 1, coords, inRank); - const auto dLdIOffset = shape::getOffset(0, dLdIShapeInfo + 1, dLdIShapeInfo + inRank + 1, coords, inRank); + const auto inOffset = shape::getOffset(inShapeInfo, coords); + const auto dLdOOffset = shape::getOffset(dLdOShapeInfo, coords); + const auto dLdIOffset = shape::getOffset(dLdIShapeInfo, coords); const auto xVal = in[inOffset]; const auto grO = dLdO[dLdOOffset]; @@ -154,8 +154,8 @@ __global__ linkage void preluBPCuda(const void *vIn, const Nd4jLong *inShapeI if(alphaShapeInfo[j + 1] == 1) coords[j + 1] = 0; - const auto alphaOffset = shape::getOffset(0, alphaShapeInfo + 1, alphaShapeInfo + alphaRank + 1, coords + 1, alphaRank); - const auto dLdAOffset = shape::getOffset(0, dLdAShapeInfo + 1, dLdAShapeInfo + alphaRank + 1, coords + 1, alphaRank); + const auto alphaOffset = shape::getOffset(alphaShapeInfo, coords + 1); + const auto dLdAOffset = shape::getOffset(dLdAShapeInfo, coords + 1); dLdI[dLdIOffset] = grO * alpha[alphaOffset]; @@ -223,7 +223,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const 
Nd4jLong *xShapeInfo, const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo, len); + const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo); shmem[threadIdx.x] = (threadIdx.x != 0) ? x[xOffset] : nd4j::math::nd4j_max(x[xOffset], temp); // take into account max element evaluated on previous iteration and stored in temp } else @@ -249,8 +249,8 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo, const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo, len); - const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo, len); + const Nd4jLong xOffset = shape::getIndexOffset(elemIdx, xShapeInfo); + const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo); z[zOffset] = nd4j::math::nd4j_exp(x[xOffset] - max); shmem[threadIdx.x] = (threadIdx.x != 0) ? z[zOffset] : (z[zOffset] + temp); // take into account sum element evaluated on previous iteration and stored in temp } @@ -272,7 +272,7 @@ __device__ void softMaxForVectorCuda(const void *vx, const Nd4jLong *xShapeInfo, for (int i = 0; i < numOfIters; ++i) { const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx >= len) continue; - const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo, len); + const Nd4jLong zOffset = shape::getIndexOffset(elemIdx, zShapeInfo); z[zOffset] /= shmem[0]; } } @@ -386,7 +386,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len); + const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); shmem[threadIdx.x] = (threadIdx.x != 0) ? x[offset] : nd4j::math::nd4j_max(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp } else @@ -412,7 +412,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len); + const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); z[offset] = nd4j::math::nd4j_exp(x[offset] - max); shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp } @@ -434,7 +434,7 @@ __global__ void logSoftMaxForVectorCuda(const void *vx, const Nd4jLong *xzShape for (int i = 0; i < numOfIters; ++i) { const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx >= len) continue; - const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len); + const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); z[offset] = nd4j::math::nd4j_log(z[offset] / shmem[0]); } } @@ -505,7 +505,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x; if(elemIdx < len) { - const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len); + const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo); shmem[threadIdx.x] = (threadIdx.x != 0) ? 
x[offset] : nd4j::math::nd4j_max(x[offset], temp); // take into account max element evaluated on previous iteration and stored in temp
}
else
@@ -531,7 +531,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
if(elemIdx < len) {
- const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
+ const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
z[offset] = nd4j::math::nd4j_exp(x[offset] - max);
shmem[threadIdx.x] = (threadIdx.x != 0) ? z[offset] : (z[offset] + temp); // take into account sum element evaluated on previous iteration and stored in temp
}
@@ -553,7 +553,7 @@ __global__ linkage void softMaxDerivForVectorCuda(const void *vx, const Nd4jLong
for (int i = 0; i < numOfIters; ++i) {
const Nd4jLong elemIdx = i * blockDim.x + threadIdx.x;
if(elemIdx >= len) continue;
- const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo, len);
+ const Nd4jLong offset = shape::getIndexOffset(elemIdx, xzShapeInfo);
z[offset] /= shmem[0];
z[offset] *= (1.f - z[offset]); // derivative
}
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
new file mode 100644
index 000000000..7134d764a
--- /dev/null
+++ b/libnd4j/include/ops/declarable/helpers/cuda/addBias.cu
@@ -0,0 +1,110 @@
+/*******************************************************************************
+ * Copyright (c) 2015-2018 Skymind, Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Apache License, Version 2.0 which is available at
+ * https://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ ******************************************************************************/
+
+//
+// @author Yurii Shyrma (iuriish@yahoo.com)
+//
+
+
+#include <ops/declarable/helpers/addBias.h>
+#include <PointersManager.h>
+
+namespace nd4j {
+namespace ops {
+namespace helpers {
+
+//////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+__global__ static void addBiasCuda( const void* vx, const Nd4jLong* xShapeInfo,
+ const void* vy, const Nd4jLong* yShapeInfo,
+ void* vz, const Nd4jLong* zShapeInfo,
+ const bool isNCHW) {
+
+ // bias [oC]
+
+ // if(input_rank == 4)
+ // input and output have same shapes: [bS, oH, oW, oC] (NHWC) or [bS, oC, oH, oW] (NCHW)
+ // if(input_rank == 5)
+ // input and output have same shapes: [bS, oD, oH, oW, oC] (NHWC) or [bS, oD, oC, oH, oW] (NCHW)
+
+ const X* x = reinterpret_cast<const X*>(vx);
+ const Y* y = reinterpret_cast<const Y*>(vy);
+ X* z = reinterpret_cast<X*>(vz);
+
+ __shared__ int rank, channelPosition;
+ __shared__ Nd4jLong *sharedMem, len;
+ __shared__ bool xzSameOffsets, xzAreSame;
+
+ if (threadIdx.x == 0) {
+
+ extern __shared__ unsigned char shmem[];
+ sharedMem = reinterpret_cast<Nd4jLong*>(shmem);
+
+ rank = shape::rank(xShapeInfo); // xRank == zRank
+ xzSameOffsets = shape::haveSameShapeAndStrides(xShapeInfo, zShapeInfo);
+ len = shape::length(xShapeInfo);
+ channelPosition = isNCHW ?
1 : rank - 1; // second or last
+ xzAreSame = x == z;
+ }
+ __syncthreads();
+
+ auto coords = sharedMem + threadIdx.x * rank;
+
+ for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < len; i += blockDim.x * gridDim.x) {
+
+ shape::index2coords(i, xShapeInfo, coords);
+
+ const auto xOffsets = shape::getOffset(xShapeInfo, coords);
+ const auto zOffsets = xzSameOffsets ? xOffsets : shape::getOffset(zShapeInfo, coords);
+ const auto yOffsets = shape::getOffset(yShapeInfo, coords + channelPosition);
+
+ if(xzAreSame)
+ z[zOffsets] += static_cast<X>(y[yOffsets]);
+ else
+ z[zOffsets] = x[xOffsets] + static_cast<X>(y[yOffsets]);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+template <typename X, typename Y>
+static void addBiasCudaLauncher(const int blocksPerGrid, const int threadsPerBlock, const int sharedMem, const cudaStream_t *stream,
+ const void* vx, const Nd4jLong* xShapeInfo,
+ const void* vy, const Nd4jLong* yShapeInfo,
+ void* vz, const Nd4jLong* zShapeInfo,
+ const bool isNCHW) {
+
+ addBiasCuda<X, Y><<<blocksPerGrid, threadsPerBlock, sharedMem, *stream>>>(vx, xShapeInfo, vy, yShapeInfo, vz, zShapeInfo, isNCHW);
+}
+
+//////////////////////////////////////////////////////////////////////////
+void addBias(nd4j::graph::Context& block, const NDArray& input, const NDArray& bias, NDArray& output, const bool isNCHW) {
+
+ PointersManager manager(block.launchContext(), "addBias");
+
+ const int threadsPerBlock = MAX_NUM_THREADS;
+ const int blocksPerGrid = (input.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
+ const int sharedMem = input.rankOf() * sizeof(Nd4jLong) * threadsPerBlock + 128;
+
+ NDArray::prepareSpecialUse({&output}, {&input, &bias});
+ BUILD_DOUBLE_SELECTOR(input.dataType(), bias.dataType(), addBiasCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, block.launchContext()->getCudaStream(), input.getSpecialBuffer(), input.getSpecialShapeInfo(), bias.getSpecialBuffer(), bias.getSpecialShapeInfo(), output.specialBuffer(), output.specialShapeInfo(), isNCHW), FLOAT_TYPES, FLOAT_TYPES);
+ NDArray::registerSpecialUse({&output}, {&input, &bias});
+
+ manager.synchronize();
+}
+
+}
+}
+}
\ No newline at end of file
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
index c27c9fb8a..5b52d1b0b 100644
--- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_hue.cu
@@ -143,13 +143,13 @@ static void _CUDA_G adjustHueSingleNCHWKernel(void *xBuffer, Nd4jLong *xTadShape
for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) {
- auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
- auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
- auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+ auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo);
+ auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo);
+ auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo);
- auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
- auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
- auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength);;
+ auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo);
+ auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo);
+ auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo);
T h, v_min, v_max;
helpers::rgb_to_hv(_ri[0], _gi[0], _bi[0], &h, &v_min, &v_max);
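[Annotation, not part of the patch] The new addBias.cu above is a compact example of the shape-helper convention this whole patch migrates to: shape::index2coords(i, shapeInfo, coords) and shape::getOffset(shapeInfo, coords) take the packed shapeInfo buffer and unpack rank, shape and strides internally, where the removed overloads took a leading base offset, separate shape/stride pointers and an explicit rank; shape::getIndexOffset likewise drops its trailing length argument. A minimal sketch of the two calling styles, assuming only a valid shapeInfo buffer for some array and a linear element index i:

    Nd4jLong coords[MAX_RANK];
    shape::index2coords(i, shapeInfo, coords);                     // new: linear index -> coordinates
    const auto offByCoords = shape::getOffset(shapeInfo, coords);  // new: coordinates -> buffer offset
    const auto offByIndex  = shape::getIndexOffset(i, shapeInfo);  // new: fused index -> offset in one call
    // pre-patch equivalents of the three calls above:
    // shape::index2coords(rank, shapeInfo + 1, i, len, coords);
    // shape::getOffset(0, shapeInfo + 1, shapeInfo + rank + 1, coords, rank);
    // shape::getIndexOffset(i, shapeInfo, len);

diff --git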
a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu index a1dc4189a..b801765b2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/adjust_saturation.cu @@ -139,13 +139,13 @@ static void _CUDA_G adjustSaturationSingleNCHWKernel(void *xBuffer, Nd4jLong *xT auto outputB = reinterpret_cast(zBuffer) + zOffsets[2]; for (Nd4jLong e = tid; e < tuples; e += blockDim.x * gridDim.x) { - auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + auto _ri = bufferR + shape::getIndexOffset(e, xTadShapeInfo); + auto _gi = bufferG + shape::getIndexOffset(e, xTadShapeInfo); + auto _bi = bufferB + shape::getIndexOffset(e, xTadShapeInfo); - auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo, tadLength); - auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo, tadLength); + auto _ro = outputR + shape::getIndexOffset(e, xTadShapeInfo); + auto _go = outputG + shape::getIndexOffset(e, xTadShapeInfo); + auto _bo = outputB + shape::getIndexOffset(e, xTadShapeInfo); T h, s, v; // Convert the RGB color to Hue/V-range. diff --git a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu index 6c3dedd20..d9188e3a8 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/batchnorm.cu @@ -64,25 +64,25 @@ __global__ static void batchnormCuda(const void* vx, const Nd4jLong* xShapeInfo, for (uint i = tid; i < minLen; i += totalThreads) { - const auto meanOffset = shape::getIndexOffset(i, meanShapeInfo, minLen); - const auto varianceOffset = shape::getIndexOffset(i, varianceShapeInfo, minLen); + const auto meanOffset = shape::getIndexOffset(i, meanShapeInfo); + const auto varianceOffset = shape::getIndexOffset(i, varianceShapeInfo); T sigmaInvGam = 1. 
/ nd4j::math::nd4j_sqrt(variance[varianceOffset] + epsilon); if(gamma != nullptr) - sigmaInvGam *= gamma[shape::getIndexOffset(i, gammaShapeInfo, minLen)]; + sigmaInvGam *= gamma[shape::getIndexOffset(i, gammaShapeInfo)]; auto betaOffset = 0; if(beta != nullptr) - betaOffset = shape::getIndexOffset(i, betaShapeInfo, minLen); + betaOffset = shape::getIndexOffset(i, betaShapeInfo); const auto xTad = x + xTadOffsets[i]; auto zTad = z + zTadOffsets[i]; for (uint j = 0; j < tadLen; ++j) { - const auto xTadOffset = shape::getIndexOffset(j, xTadShapeInfo, tadLen); - const auto zTadOffset = shape::getIndexOffset(j, zTadShapeInfo, tadLen); + const auto xTadOffset = shape::getIndexOffset(j, xTadShapeInfo); + const auto zTadOffset = shape::getIndexOffset(j, zTadShapeInfo); zTad[zTadOffset] = (xTad[xTadOffset] - mean[meanOffset]) * sigmaInvGam; @@ -130,10 +130,10 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo for (uint i = tid; i < xLen; i += totalThreads) { - shape::index2coords(xRank, shape::shapeOf(const_cast(xShapeInfo)), i, xLen, coords); + shape::index2coords(i, xShapeInfo, coords); - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coords, xRank); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast(zShapeInfo)), shape::stride(const_cast(zShapeInfo)), coords, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); if(minRank == xRank) { for (uint i = 0, j = 0; i < xRank; ++i) { @@ -146,20 +146,20 @@ __global__ static void batchnormCuda2(const void* vx, const Nd4jLong* xShapeInfo else // minRank = numDims = 1 in this case coords[0] = coords[dims[0]]; - const auto meanOffset = shape::getOffset(0, shape::shapeOf(const_cast(meanShapeInfo)), shape::stride(const_cast(meanShapeInfo)), coords, minRank); - const auto varianceOffset = shape::getOffset(0, shape::shapeOf(const_cast(varianceShapeInfo)), shape::stride(const_cast(varianceShapeInfo)), coords, minRank); + const auto meanOffset = shape::getOffset(meanShapeInfo, coords); + const auto varianceOffset = shape::getOffset(varianceShapeInfo, coords); T sigmaInvGam = 1. 
/ nd4j::math::nd4j_sqrt(variance[varianceOffset] + epsilon); if(gamma != nullptr) { - const auto gammaOffset = shape::getOffset(0, shape::shapeOf(const_cast(gammaShapeInfo)), shape::stride(const_cast(gammaShapeInfo)), coords, minRank); + const auto gammaOffset = shape::getOffset(gammaShapeInfo, coords); sigmaInvGam *= gamma[gammaOffset]; } z[zOffset] = (x[xOffset] - mean[meanOffset]) * sigmaInvGam; if(beta != nullptr) { - const auto betaOffset = shape::getOffset(0, shape::shapeOf(const_cast(betaShapeInfo)), shape::stride(const_cast(betaShapeInfo)), coords, minRank); + const auto betaOffset = shape::getOffset(betaShapeInfo, coords); z[zOffset] += beta[betaOffset]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu b/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu index 87e4948ec..90619c76c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/betaInc.cu @@ -15,7 +15,7 @@ ******************************************************************************/ // -// Created by Yurii Shyrma on 11.12.2017 +// @author Yurii Shyrma (iuriish@yahoo.com) // #include @@ -117,10 +117,10 @@ __global__ void betaIncForArrayCuda(const void* va, const Nd4jLong* aShapeInfo, Nd4jLong len = shape::length(xShapeInfo); - const T a = *(reinterpret_cast(va) + shape::getIndexOffset(j, aShapeInfo, len)); - const T b = *(reinterpret_cast(vb) + shape::getIndexOffset(j, bShapeInfo, len)); - const T x = *(reinterpret_cast(vx) + shape::getIndexOffset(j, xShapeInfo, len)); - T& z = *(reinterpret_cast(vz) + shape::getIndexOffset(j, zShapeInfo, len)); + const T a = *(reinterpret_cast(va) + shape::getIndexOffset(j, aShapeInfo)); + const T b = *(reinterpret_cast(vb) + shape::getIndexOffset(j, bShapeInfo)); + const T x = *(reinterpret_cast(vx) + shape::getIndexOffset(j, xShapeInfo)); + T& z = *(reinterpret_cast(vz) + shape::getIndexOffset(j, zShapeInfo)); // t^{n-1} * (1 - t)^{n-1} is symmetric function with respect to x = 0.5 if(a == b && x == static_cast(0.5)) { diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc index aefb97963..63e406cc6 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc +++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cppc @@ -35,12 +35,12 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp auto colShape = shape::shapeOf(colShapeBuffer); auto colStride = shape::stride(colShapeBuffer); auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); + auto imStride = shape::stride(imShapeBuffer); const int bS = imShape[0]; const int iC = imShape[1]; const int kH = colShape[2]; - const int kW = colShape[3]; + const int kW = colShape[3]; const int oH = colShape[4]; const int oW = colShape[5]; const Nd4jLong colStride0 = colStride[0]; @@ -58,31 +58,31 @@ void col2im_(nd4j::LaunchContext & context, const NDArray& input, NDArray& outp const auto imEWS = shape::elementWiseStride(imShapeBuffer); if(imEWS == 1) { memset(imBuff, 0, shape::length(imShapeBuffer) * sizeof(T)); - } + } else if (imEWS > 1) { PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) for (int i = 0; i < shape::length(imShapeBuffer) * imEWS; i += imEWS) imBuff[i] = static_cast(0.f); - } - else { + } + else { const auto len = shape::length(imShapeBuffer); PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close)) - for (int i = 0; i < len; i++) - imBuff[shape::getIndexOffset(i, 
imShapeBuffer, len)] = static_cast(0.f); + for (int i = 0; i < len; i++) + imBuff[shape::getIndexOffset(i, imShapeBuffer)] = static_cast(0.f); } - + T *col, *im; int imRow, imCol; if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { - + PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, imRow, imCol)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + for (int b = 0; b < bS; b++) { + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + for (int colW = 0; colW < oW; ++colW) { imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; @@ -97,21 +97,21 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(col, im, } } } - } + } } else { PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { + for (int b = 0; b < bS; b++) { for (int colH = 0; colH < oH; ++colH) { for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; - + col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; @@ -120,9 +120,9 @@ PRAGMA_OMP_PARALLEL_FOR_ARGS(schedule(static) proc_bind(close) private(im, col, } } } - } + } } - } + } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu index 9ab7337c2..dc1935b83 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/col2im.cu @@ -61,9 +61,9 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI auto coords = sharedMem + threadIdx.x * colRank; - shape::index2coords(imRank, imShapeInfo + 1, imInd, imLen, coords); + shape::index2coords(imInd, imShapeInfo, coords); - const auto imOffset = shape::getOffset(0, imShapeInfo + 1, imShapeInfo + imRank + 1, coords, imRank); + const auto imOffset = shape::getOffset(imShapeInfo, coords); const int imH = coords[2] + pH; const int imW = coords[3] + pW; @@ -86,7 +86,7 @@ static __global__ void col2imCuda(const void* columns, const Nd4jLong* colShapeI coords[2] /= dH; coords[3] /= dW; - val += col[shape::getOffset(0, colShapeInfo + 1, colShapeInfo + colRank + 1, coords, colRank)]; + val += col[shape::getOffset(colShapeInfo, coords)]; } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu b/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu index 545d7c668..d2792b630 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/compare_elem.cu @@ -32,8 +32,8 @@ namespace nd4j { // each thread will compare 2 elements: E and E+1 for (int e = tid; e < length - 1; e += blockDim.x * gridDim.x) { - auto val0 = x[shape::getIndexOffset(e, xShapeInfo, length)]; - 
auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo, length)]; + auto val0 = x[shape::getIndexOffset(e, xShapeInfo)]; + auto val1 = x[shape::getIndexOffset(e+1, xShapeInfo)]; bool v = false; if (isStrict) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu index d372f05c8..6f9a8c6ab 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu @@ -59,9 +59,9 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, tid, zLen, coords); + shape::index2coords(tid, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); int inArrIdx = 0; Nd4jLong *xShapeInfo = reinterpret_cast(pxShapeInfo)[inArrIdx]; @@ -72,7 +72,7 @@ __global__ static void concatCuda(void* pVx, void* pxShapeInfo, void* vz, Nd4jL } const auto* x = reinterpret_cast(reinterpret_cast(pVx)[inArrIdx]); - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu b/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu index 12f14b20b..3738d7770 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/confusion.cu @@ -59,7 +59,7 @@ namespace helpers { auto tZ = z + tadOffsets[label]; T val = (weightsBuffer == nullptr ? (T)1.0f : w[t]); - auto idx = shape::getIndexOffset(pred, tadShape, arrLen); + auto idx = shape::getIndexOffset(pred, tadShape); tZ[idx] = val; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu index c08551318..273749bfd 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/convolutions.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -63,7 +64,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI auto coords = sharedMem + threadIdx.x * colRank; - shape::index2coords(colRank, colShapeInfo + 1, colInd, colLen, coords); + shape::index2coords(colInd, colShapeInfo, coords); // const auto colW = coords[7]; // const auto colH = coords[6]; @@ -74,7 +75,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI // const auto c = coords[1]; // const auto b = coords[0]; - const auto colOffset = shape::getOffset(0, colShapeInfo + 1, colShapeInfo + colRank + 1, coords, colRank); + const auto colOffset = shape::getOffset(colShapeInfo, coords); coords[2] = -pD + coords[2] * dD + coords[5] * sD; // const auto volDep = (-pD + kDep * dD) + colD * sD; coords[3] = -pH + coords[3] * dH + coords[6] * sH; // const auto volRow = (-pH + kRow * dH) + colH * sH; @@ -83,7 +84,7 @@ static __global__ void vol2colCuda(const void* volume, const Nd4jLong* volShapeI if (static_cast(coords[2]) >= static_cast(iD) || static_cast(coords[3]) >= static_cast(iH) || static_cast(coords[4]) >= static_cast(iW)) col[colOffset] = static_cast(0.); else - col[colOffset] = vol[shape::getOffset(0, volShapeInfo + 1, volShapeInfo + volRank + 1, coords, volRank)]; + col[colOffset] = 
vol[shape::getOffset(volShapeInfo, coords)]; } ////////////////////////////////////////////////////////////////////////// @@ -149,9 +150,9 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape auto coords = sharedMem + threadIdx.x * colRank; - shape::index2coords(volRank, volShapeInfo + 1, volInd, volLen, coords); + shape::index2coords(volInd, volShapeInfo, coords); - const auto volOffset = shape::getOffset(0, volShapeInfo + 1, volShapeInfo + volRank + 1, coords, volRank); + const auto volOffset = shape::getOffset(volShapeInfo, coords); const int imD = coords[2] + pD; const int imH = coords[3] + pH; @@ -181,7 +182,7 @@ static __global__ void col2volCuda(const void* columns, const Nd4jLong* colShape coords[3] /= dH; coords[4] /= dW; - val += col[shape::getOffset(0, colShapeInfo + 1, colShapeInfo + colRank + 1, coords, colRank)]; + val += col[shape::getOffset(colShapeInfo, coords)]; } } } @@ -268,8 +269,8 @@ static void conv2d_(nd4j::graph::Context& block, const NDArray* input, const NDA //----- add biases if required -----// if(bias) - output->applyBroadcast(broadcast::Add, {indIOioC}, bias); - // helpers::addBias(*output, *bias, isNCHW); + // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCHW); if(!isNCHW) delete input; @@ -283,7 +284,7 @@ void ConvolutionUtils::conv2d(nd4j::graph::Context& block, const NDArray* input, ////////////////////////////////////////////////////////////////////////// template -static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { +static void depthwiseConv2d_(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { // input [bS, iH, iW, iC] (NHWC) or [bS, iC, iH, iW] (NCHW) // weights [kH, kW, iC, mC] always @@ -330,7 +331,8 @@ static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const MmulHelper::tensorDot(&columns, weights, &outputReshaped, modifColumns, {{2,0,1,3},{iC,kH*kW,mC}}, modifOutput); // [iC, bS*oH*oW, kW*kH] x [iC, kH*kW, mC] = [iC, bS*oH*oW, mC] if(bias) - output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + // output->applyBroadcast(broadcast::Add, {indIOioC}, bias); + helpers::addBias(block, *output, *bias, *output, isNCHW); if(!isNCHW) delete input; @@ -338,7 +340,7 @@ static void depthwiseConv2d_(const NDArray* input, const NDArray* weights, const ////////////////////////////////////////////////////////////////////////// void ConvolutionUtils::depthwiseConv2d(nd4j::graph::Context& block, const NDArray* input, const NDArray* weights, const NDArray* bias, NDArray* output, const int kH, const int kW, const int sH, const int sW, int pH, int pW, const int dH, const int dW, const int isSameMode, const int isNCHW) { - BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); + BUILD_SINGLE_SELECTOR_TWICE(input->dataType(), depthwiseConv2d_, (block, input, weights, bias, output, kH, kW, sH, sW, pH, pW, dH, dW, isSameMode, isNCHW), FLOAT_TYPES); } ////////////////////////////////////////////////////////////////////////// @@ -735,9 +737,9 
@@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); int dstart = coords[2] * sD - pD; int hstart = coords[3] * sH - pH; @@ -768,7 +770,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) { for (coords[3] = hstart; coords[3] < hend; coords[3] += dH){ for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) { - T val = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + T val = x[shape::getOffset(xShapeInfo, coords)]; if (val > max) max = val; } @@ -784,7 +786,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - sum += x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + sum += x[shape::getOffset(xShapeInfo, coords)]; if (extraParam0 == 0) { //Exclude padding uint a = (dend - dstart) / dD + ((dend - dstart) % dD == 0 ? 0 : 1); @@ -805,7 +807,7 @@ __global__ static void pooling3dCuda(const void* vx, const Nd4jLong* xShapeInfo, for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0); + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); sum = nd4j::math::nd4j_pow(sum, (T) 1.f / extraParam0); @@ -885,9 +887,9 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, yShapeInfo + 1, yInd, yLen, coords); + shape::index2coords(yInd, yShapeInfo, coords); - const auto yOffset = shape::getOffset(0, yShapeInfo + 1, yShapeInfo + rank + 1, coords, rank); + const auto yOffset = shape::getOffset(yShapeInfo, coords); int hstart = coords[2] * sH - pH; int wstart = coords[3] * sW - pW; @@ -913,7 +915,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf T max = -DataTypeUtils::max(); for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) { for (coords[3] = wstart; coords[3] < wend; coords[3] += dW){ - T val = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + T val = x[shape::getOffset(xShapeInfo, coords)]; if (val > max) { max = val; coord2 = coords[2]; @@ -923,7 +925,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf } coords[2] = coord2; coords[3] = coord3; - auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + auto zOffset = shape::getOffset(zShapeInfo, coords); nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], y[yOffset]); //z[zOffset] += y[yOffset]; } @@ -941,7 +943,7 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) for (coords[3] = wstart; coords[3] < wend; 
coords[3] += dW) - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val); + nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], val); } break; @@ -953,14 +955,14 @@ __global__ static void pooling2dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) - sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0); + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); val *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); for (coords[2] = hstart; coords[2] < hend; coords[2] += dH) { for (coords[3] = wstart; coords[3] < wend; coords[3] += dW) { - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(x[xOffset])); } } @@ -1046,9 +1048,9 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, yShapeInfo + 1, yInd, yLen, coords); + shape::index2coords(yInd, yShapeInfo, coords); - const auto yOffset = shape::getOffset(0, yShapeInfo + 1, yShapeInfo + rank + 1, coords, rank); + const auto yOffset = shape::getOffset(yShapeInfo, coords); int dstart = coords[2] * sD - pD; int hstart = coords[3] * sH - pH; @@ -1080,7 +1082,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) { for (coords[3] = hstart; coords[3] < hend; coords[3] += dH){ for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) { - T val = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + T val = x[shape::getOffset(xShapeInfo, coords)]; if (val > max) { max = val; coord2 = coords[2]; @@ -1093,7 +1095,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf coords[2] = coord2; coords[3] = coord3; coords[4] = coord4; - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], y[yOffset]); + nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], y[yOffset]); } break; @@ -1110,7 +1112,7 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank)], val); + nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getOffset(zShapeInfo, coords)], val); } break; @@ -1123,15 +1125,15 @@ __global__ static void pooling3dBPCuda(const void* vx, const Nd4jLong* xShapeInf for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) - sum += 
nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]), extraParam0); + sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[shape::getOffset(xShapeInfo, coords)]), extraParam0); val *= nd4j::math::nd4j_pow(sum, ((T)1.f - extraParam0) / extraParam0); for (coords[2] = dstart; coords[2] < dend; coords[2] += dD) { for (coords[3] = hstart; coords[3] < hend; coords[3] += dH) { for (coords[4] = wstart; coords[4] < wend; coords[4] += dW) { - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); nd4j::math::atomics::nd4j_atomicAdd(&z[zOffset], val * nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(x[xOffset]), extraParam0 - 1.f) * nd4j::math::nd4j_sgn(x[xOffset])); } } @@ -1363,14 +1365,14 @@ __global__ static void upsampling2dCuda(const void* vx, const Nd4jLong* xShapeIn auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); coords[dimIH] /= factorH; coords[dimIH + 1] /= factorW; - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } @@ -1431,15 +1433,15 @@ __global__ static void upsampling3dCuda(const void* vx, const Nd4jLong* xShapeIn auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); coords[dimID] /= factorD; coords[dimID + 1] /= factorH; coords[dimID + 2] /= factorW; - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } @@ -1504,9 +1506,9 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); z[zOffset] = 0; @@ -1515,7 +1517,7 @@ __global__ static void upsampling2dBPCuda(const void* vx, const Nd4jLong* xShape for(coords[dimIH] = zCoord2; coords[dimIH] < zCoord2 + factorH; ++coords[dimIH]) for(coords[dimIH + 1] = zCoord3; coords[dimIH + 1] < zCoord3 + factorW; ++coords[dimIH + 1]) - z[zOffset] += x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] += x[shape::getOffset(xShapeInfo, coords)]; } ////////////////////////////////////////////////////////////////////////// @@ -1579,9 +1581,9 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape auto coords = sharedMem + threadIdx.x * rank; - shape::index2coords(rank, zShapeInfo + 1, zInd, zLen, coords); + shape::index2coords(zInd, zShapeInfo, coords); - const 
auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); z[zOffset] = 0; @@ -1592,7 +1594,7 @@ __global__ static void upsampling3dBPCuda(const void* vx, const Nd4jLong* xShape for(coords[dimID] = zCoord2; coords[dimID] < zCoord2 + factorD; ++coords[dimID]) for(coords[dimID + 1] = zCoord3; coords[dimID + 1] < zCoord3 + factorH; ++coords[dimID + 1]) for(coords[dimID + 2] = zCoord4; coords[dimID + 2] < zCoord4 + factorW; ++coords[dimID + 2]) - z[zOffset] += x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] += x[shape::getOffset(xShapeInfo, coords)]; } ////////////////////////////////////////////////////////////////////////// diff --git a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu index e95473739..1cd771b98 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/cross.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/cross.cu @@ -58,12 +58,12 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo, for (uint i = tid; i < lenWithoutLastDim; i += totalThreads) { - shape::index2coords(rank - 1, shape::shapeOf(const_cast(xShapeInfo)), i, lenWithoutLastDim, coords); + shape::index2coords(i, rank - 1, xShapeInfo + 1, coords); coords[rank - 1] = 0; - auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coords, rank); - auto yOffset = shape::getOffset(0, shape::shapeOf(const_cast(yShapeInfo)), shape::stride(const_cast(yShapeInfo)), coords, rank); + auto xOffset = shape::getOffset(xShapeInfo, coords); + auto yOffset = shape::getOffset(yShapeInfo, coords); const auto x0 = x[xOffset]; const auto y0 = y[yOffset]; @@ -80,7 +80,7 @@ __global__ static void crossCuda(const void* vx, const Nd4jLong* xShapeInfo, const auto x2 = x[xOffset]; const auto y2 = y[yOffset]; - auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast(zShapeInfo)), shape::stride(const_cast(zShapeInfo)), coords, rank); + auto zOffset = shape::getOffset(zShapeInfo, coords); z[zOffset] = x1 * y2 - x2 * y1; zOffset += shape::stride(const_cast(zShapeInfo))[rank - 1]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/diag.cu b/libnd4j/include/ops/declarable/helpers/cuda/diag.cu index 0e861b866..fe2d412d9 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/diag.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/diag.cu @@ -42,7 +42,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; for (int t = tid; t < inputLength; t += step) { - z[shape::getIndexOffset(t * (inputLength + 1), outputShape, outputLength)] = x[shape::getIndexOffset(t, inputShape, inputLength)]; //tX]; + z[shape::getIndexOffset(t * (inputLength + 1), outputShape)] = x[shape::getIndexOffset(t, inputShape)]; //tX]; } } @@ -63,7 +63,7 @@ static __global__ void diagFunctorKernel(void* outputBuffer, Nd4jLong* outputSha const auto step = gridDim.x * blockDim.x; Nd4jLong i = threadIdx.x * (outputLength + 1); for (int t = tid; t < outputLength && i < inputLength; t += step) { - z[shape::getIndexOffset(t, outputShape, outputLength)] = x[shape::getIndexOffset(i, inputShape, inputLength)]; //tX]; + z[shape::getIndexOffset(t, outputShape)] = x[shape::getIndexOffset(i, inputShape)]; //tX]; i += outputLength + 1; } } diff --git 
a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu index de37ab276..92aa4c55a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dilation2d.cu @@ -72,7 +72,7 @@ __global__ static void dilation2dCuda(const void* vx, const Nd4jLong* xShapeInfo auto xzCoords = sharedMem + threadIdx.x * (xzRank + yRank); auto yCoords = xzCoords + xzRank; - shape::index2coords(xzRank, zShapeInfo + 1, zInd, zLen, xzCoords); + shape::index2coords(zInd, zShapeInfo, xzCoords); const auto zOffset = shape::getOffset(zShapeInfo, xzCoords); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu index 5b4c27bd0..9b2a42d8f 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dropout.cu @@ -41,7 +41,7 @@ namespace helpers { // if probability is ok - we're saving scaled value if (double(val) < probVal) - output[shape::getIndexOffset(e, outputShape, inLen)] = T(input[shape::getIndexOffset(e, inputShape, inLen)] / probVal); + output[shape::getIndexOffset(e, outputShape)] = T(input[shape::getIndexOffset(e, inputShape)] / probVal); } } @@ -140,11 +140,11 @@ namespace helpers { auto step = blockDim.x * gridDim.x; for (int e = tid; e < len; e += step) { - const auto zOffset = shape::getIndexOffset(e, outputShape, len); + const auto zOffset = shape::getIndexOffset(e, outputShape); // if probability was non-zero on FF step, we'll scale grads back if (output[zOffset] != T(0.)) - output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape, len)] / probValue); + output[zOffset] = T(input[shape::getIndexOffset(e, gradOutShape)] / probValue); } } @@ -173,8 +173,8 @@ namespace helpers { for (auto e = tid; e < inLen; e += step) { T val = nodeRng->relativeT(e, T(0.f), T(1.f)); - T xVal = input[shape::getIndexOffset(e, inputShape, inLen)]; - output[shape::getIndexOffset(e, outputShape, inLen)] = (val >= T(probValue) ? T(alpha * beta + alpha1) : T(alpha * (double)xVal + alpha1)); + T xVal = input[shape::getIndexOffset(e, inputShape)]; + output[shape::getIndexOffset(e, outputShape)] = (val >= T(probValue) ? 
T(alpha * beta + alpha1) : T(alpha * (double)xVal + alpha1)); } } template diff --git a/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu b/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu index 75b541b72..c70283997 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/dynamic.cu @@ -57,7 +57,7 @@ namespace nd4j { for (Nd4jLong e = threadIdx.x; e < iLimit; e += blockDim.x) { // load set of indices into shared memory if (e < iLength) - rawIndices[threadIdx.x] = i[shape::getIndexOffset(e, iShapeInfo, iLength)]; + rawIndices[threadIdx.x] = i[shape::getIndexOffset(e, iShapeInfo)]; __syncthreads(); // now we need to find out where our actual updates will be mapped @@ -76,7 +76,7 @@ namespace nd4j { // doing actual update if (e < iLength) if (trueIndices[threadIdx.x] >= 0) { - z[trueIndices[threadIdx.x]] = x[shape::getIndexOffset(e, xShapeInfo, xLength)]; + z[trueIndices[threadIdx.x]] = x[shape::getIndexOffset(e, xShapeInfo)]; } __syncthreads(); @@ -97,12 +97,12 @@ namespace nd4j { int outCnt = 0; for (Nd4jLong e = 0; e < iLength; e++) { - if (indices[shape::getIndexOffset(e, iShapeInfo, iLength)] == i) { + if (indices[shape::getIndexOffset(e, iShapeInfo)] == i) { auto dx = x + xTadOffsets[e]; auto dz = z + zTadOffsets[i][outCnt++]; for (int f = threadIdx.x; f < xLength; f += blockDim.x) { - dz[shape::getIndexOffset(f, zTadShapeInfos[i], xLength)] = dx[shape::getIndexOffset(f, xTadShapeInfo, xLength)]; + dz[shape::getIndexOffset(f, zTadShapeInfos[i])] = dx[shape::getIndexOffset(f, xTadShapeInfo)]; } } } @@ -190,9 +190,9 @@ namespace nd4j { auto iLength = shape::length(iShapeInfo); for (int i = threadIdx.x; i < iLength; i += blockDim.x) { - auto idx = indices[shape::getIndexOffset(i, iShapeInfo, iLength)]; + auto idx = indices[shape::getIndexOffset(i, iShapeInfo)]; if (idx >= 0 && idx < zLength) - z[shape::getIndexOffset(idx, zShapeInfo, zLength)] = x[shape::getIndexOffset(i, xShapeInfo, iLength)]; + z[shape::getIndexOffset(idx, zShapeInfo)] = x[shape::getIndexOffset(i, xShapeInfo)]; } } } @@ -215,13 +215,13 @@ namespace nd4j { auto xLength = shape::length(xShapeInfo); for (int i = 0; i < iLength; i++) { - auto idx = indices[shape::getIndexOffset(i, iShapeInfo, iLength)]; + auto idx = indices[shape::getIndexOffset(i, iShapeInfo)]; auto z = bz + zTadOffsets[idx]; auto x = reinterpret_cast(vx[e]) + xTadOffsets[e][i]; for (int f = threadIdx.x; f < zLength; f += blockDim.x) { - z[shape::getIndexOffset(f, zTadShapeInfo, zLength)] = x[shape::getIndexOffset(f, xShapeInfo, xLength)]; + z[shape::getIndexOffset(f, zTadShapeInfo)] = x[shape::getIndexOffset(f, xShapeInfo)]; } __syncthreads(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu b/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu index 9f6501cad..6cbedcc2a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/extract_patches.cu @@ -66,8 +66,8 @@ namespace helpers { // for (auto pixel = 0; pixel < lastDim; pixel++) { // Nd4jLong zPos[] = {i, j, pos}; // Nd4jLong xPos[] = {row, col, pixel}; -// auto zIndex = shape::getOffset(0, shape::shapeOf(outTadShape), shape::stride(outTadShape), zPos, 3); -// auto xIndex = shape::getOffset(0, shape::shapeOf(patchShape), shape::stride(patchShape), xPos, 3); +// auto zIndex = shape::getOffset(outTadShape, zPos); +// auto xIndex = shape::getOffset(patchShape, xPos); // if (theSame) { // SAME case // if (row >= 0 && col >= 0 && 
row < rowDim && col < colDim) // matrix[zIndex] = patch[xIndex]; //outMatrix->p(i, j, pos, patch->e(row, col, pixel)); @@ -86,18 +86,6 @@ namespace helpers { template static __global__ void globalExtractPatchesKernel(bool theSame, int batchCount, int sizeRow, int sizeCol, int rowDim, int colDim, int outRowDim, int outColDim, int strideRow, int strideCol, int rateRow, int rateCol, int rowCast, int colCast, int lastDim, T* input, Nd4jLong* patchShape, Nd4jLong* inputOffsets, T* output, Nd4jLong* outTadShape, Nd4jLong* outputOffsets) { - __shared__ Nd4jLong* xShapeOf; - __shared__ Nd4jLong* xStrideOf; - __shared__ Nd4jLong* zShapeOf; - __shared__ Nd4jLong* zStrideOf; - - if (0 == threadIdx.x) { - xShapeOf = shape::shapeOf(patchShape); - xStrideOf = shape::stride(patchShape); - zShapeOf = shape::shapeOf(outTadShape); - zStrideOf = shape::stride(outTadShape); - } - __syncthreads(); auto start = threadIdx.x + blockIdx.x * blockDim.x; @@ -128,7 +116,7 @@ namespace helpers { bool setUp = (theSame && row >= 0 && col >= 0 && row < rowDim && col < colDim) || (!theSame); if (setUp) { // VALID or SAME cases - outMatrix[shape::getOffset(0, zShapeOf, zStrideOf, zPos, 3)] = patch[shape::getOffset(0, xShapeOf, xStrideOf, xPos, 3)]; + outMatrix[shape::getOffset(outTadShape, zPos)] = patch[shape::getOffset(patchShape, xPos)]; } pos++; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu index 6a818a2cd..df4e25130 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/flatten.cu @@ -35,17 +35,11 @@ namespace nd4j { auto xBuffer = reinterpret_cast(xBuffers[e]); auto xShapeInfo = xShapeInfos[e]; - auto xShape = shape::shapeOf(xShapeInfo); - auto xStride = shape::stride(xShapeInfo); - auto xRank = shape::rank(xShapeInfo); auto xLength = shape::length(xShapeInfo); // each element of this input array has own place within common output array - for (uint i = threadIdx.x; i < xLength; i += blockDim.x) { - shape::index2coords(xRank, xShape, i, xLength, xCoord, order); - auto xOffset = shape::getOffset(0, xShape, xStride, xCoord, xRank); - z[i] = xBuffer[xOffset]; - } + for (uint i = threadIdx.x; i < xLength; i += blockDim.x) + z[i] = xBuffer[getIndexOffsetOrdered(i, xShapeInfo, order)]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu index 4eb5450a3..308e58814 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gather.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gather.cu @@ -52,10 +52,9 @@ namespace helpers { auto step = blockDim.x * gridDim.x; for (int j = start; j < zLen; j += step) { - auto zIndex = shape::getIndexOffset(j, zShapeInfo, zLen); - auto yIndex = shape::getIndexOffset(j, yShapeInfo, yLen); - auto xIndex = shape::getIndexOffset(y[yIndex], xShapeInfo, xLen); - //printf("%lld , %lld\n", zIndex, xIndex); + auto zIndex = shape::getIndexOffset(j, zShapeInfo); + auto yIndex = shape::getIndexOffset(j, yShapeInfo); + auto xIndex = shape::getIndexOffset(y[yIndex], xShapeInfo); z[zIndex] = x[xIndex]; } } @@ -76,15 +75,14 @@ __global__ static void gatherCuda(const int numOfSubArrs, for (int i = blockIdx.x; i < numOfSubArrs; i += gridDim.x) { if (threadIdx.x == 0) { - x = reinterpret_cast(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo, numOfSubArrs)]]; + x = reinterpret_cast(vx) + xOffsets[y[shape::getIndexOffset(i, yShapeInfo)]]; z = reinterpret_cast(vz) + zOffsets[i]; } 
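// A hedged sketch (not the library implementation) of the signature change this patch applies
// everywhere: shape::getIndexOffset(i, shapeInfo, length) becomes shape::getIndexOffset(i, shapeInfo),
// and shape::getOffset(0, shapeOf, strideOf, coords, rank) becomes shape::getOffset(shapeInfo, coords).
// Assuming the usual libnd4j shapeInfo layout [rank, shape[0..rank), stride[0..rank), ...] — the same
// layout the hunks rely on when they index shapeInfo + 1 for shape and shapeInfo + rank + 1 for
// strides — the two-argument getOffset can recover everything from shapeInfo alone. getOffsetSketch
// is a hypothetical name used only for illustration:
//
//     Nd4jLong getOffsetSketch(const Nd4jLong* shapeInfo, const Nd4jLong* coords) {
//         const Nd4jLong rank    = shapeInfo[0];            // rank is the first element
//         const Nd4jLong* stride = shapeInfo + 1 + rank;    // strides follow the shape block
//         Nd4jLong offset = 0;
//         for (Nd4jLong d = 0; d < rank; ++d)
//             offset += coords[d] * stride[d];              // dot product of coords and strides
//         return offset;
//     }
//
// The leading 0 argument in the old calls was a base offset that is always zero at these call
// sites, which is presumably why the new overload drops it along with the redundant
// shapeOf()/stride()/rank arguments.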
__syncthreads(); for (int j = threadIdx.x; j < len; j += blockDim.x) { - auto zIndex = shape::getIndexOffset(j, zShapeInfo, len); - auto xIndex = shape::getIndexOffset(j, xShapeInfo, len); - //printf("%lld , %lld\n", zIndex, xIndex); + auto zIndex = shape::getIndexOffset(j, zShapeInfo); + auto xIndex = shape::getIndexOffset(j, xShapeInfo); z[zIndex] = x[xIndex]; } __syncthreads(); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu index 6587b4ca7..11ba6571b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu @@ -83,9 +83,9 @@ namespace nd4j { for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(zRank, zShapeInfo + 1, i, zLen, zCoordStart); + shape::index2coords(i, zShapeInfo, zCoordStart); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + zRank + 1, zCoordStart, zRank); + const auto zOffset = shape::getOffset(zShapeInfo, zCoordStart); // last y coordinate int coordToRestore; @@ -93,7 +93,7 @@ namespace nd4j { coordToRestore = static_cast(zCoordStart[yRank - 1]); zCoordStart[yRank - 1] = 0; // last y coordinate - const auto yOffset = shape::getOffset(0, yShapeInfo + 1, yShapeInfo + yRank + 1, zCoordStart, yRank); + const auto yOffset = shape::getOffset(yShapeInfo, zCoordStart); //restore z coordinate if(yLastDim != xRank) @@ -103,7 +103,7 @@ namespace nd4j { for(uint j = 0; j < yLastDim; ++j) xCoordStart[j] = y[yOffset + j * yShapeInfo[2 * yRank]]; // last stride - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, xCoordStart, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, xCoordStart); z[zOffset] = x[xOffset]; printf("z[%lld] = x[%lld] = %f\n", zOffset, xOffset, (float) z[zOffset]); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu b/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu index 3bc30e373..9802ff231 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/hamming.cu @@ -43,8 +43,8 @@ namespace nd4j { auto tid = threadIdx.x + blockIdx.x * blockDim.x; for (Nd4jLong e = tid; e < length; e += blockDim.x * gridDim.x) { - auto _x = static_cast(x[shape::getIndexOffset(e, xShapeInfo, length)]); - auto _y = static_cast(y[shape::getIndexOffset(e, yShapeInfo, length)]); + auto _x = static_cast(x[shape::getIndexOffset(e, xShapeInfo)]); + auto _y = static_cast(y[shape::getIndexOffset(e, yShapeInfo)]); // we save intermediate result into shared memory shared[threadIdx.x] += __popcll(_x ^ _y); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu b/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu index 317f1d857..07d7bcd93 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/histogramFixedWidth.cu @@ -55,7 +55,7 @@ __global__ static void histogramFixedWidthCuda( const void* vx, const Nd4jLong* for (Nd4jLong i = tid; i < xLen; i += totalThreads) { - const X value = x[shape::getIndexOffset(i, xShapeInfo, xLen)]; + const X value = x[shape::getIndexOffset(i, xShapeInfo)]; Nd4jLong zIndex; @@ -66,7 +66,7 @@ __global__ static void histogramFixedWidthCuda( const void* vx, const Nd4jLong* else zIndex = static_cast((value - leftEdge) / binWidth); - nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getIndexOffset(zIndex, zShapeInfo, nbins)], 1); + 
nd4j::math::atomics::nd4j_atomicAdd(&z[shape::getIndexOffset(zIndex, zShapeInfo)], 1); } } @@ -101,7 +101,7 @@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con // const auto tid = blockIdx.x * gridDim.x + threadIdx.x; // const auto step = gridDim.x * blockDim.x; // for (int t = tid; t < bufferLength; t += step) { -// destination[t] = reinterpret_cast(source)[shape::getIndexOffset(t, sourceShape, bufferLength)]; +// destination[t] = reinterpret_cast(source)[shape::getIndexOffset(t, sourceShape)]; // } // } @@ -110,7 +110,7 @@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con // const auto tid = blockIdx.x * gridDim.x + threadIdx.x; // const auto step = gridDim.x * blockDim.x; // for (int t = tid; t < bufferLength; t += step) { -// reinterpret_cast(destination)[shape::getIndexOffset(t, destinationShape, bufferLength)] = source[t]; +// reinterpret_cast(destination)[shape::getIndexOffset(t, destinationShape)] = source[t]; // } // } @@ -130,7 +130,7 @@ void histogramFixedWidth(nd4j::LaunchContext* context, const NDArray& input, con // for(auto i = tid; i < inputLength; i += step) { -// const T value = x[shape::getIndexOffset(i, inputShape, inputLength)]; +// const T value = x[shape::getIndexOffset(i, inputShape)]; // Nd4jLong currInd = static_cast((value - leftEdge) / binWidth); // if(value < secondEdge) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu index f2fb9d94a..62fcd0588 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/im2col.cu @@ -64,9 +64,9 @@ __global__ static void im2colCuda(const void *image, void *columns, auto coords = sharedMem + threadIdx.x * colRank; - shape::index2coords(colRank, colShapeInfo + 1, colInd, colLen, coords); + shape::index2coords(colInd, colShapeInfo, coords); - const auto colOffset = shape::getOffset(0, colShapeInfo + 1, colShapeInfo + colRank + 1, coords, colRank); + const auto colOffset = shape::getOffset(colShapeInfo, coords); coords[2] = (-pH + coords[2] * dH) + coords[4] * sH; // imH coords[3] = (-pW + coords[3] * dW) + coords[5] * sW; // imW @@ -74,7 +74,7 @@ __global__ static void im2colCuda(const void *image, void *columns, if (static_cast(coords[2]) >= static_cast(iH) || static_cast(coords[3]) >= static_cast(iW)) col[colOffset] = zeroPadVal; else - col[colOffset] = im[shape::getOffset(0, imShapeInfo + 1, imShapeInfo + imRank + 1, coords, imRank)]; + col[colOffset] = im[shape::getOffset(imShapeInfo, coords)]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu index 431524bf3..715792a8c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_resize.cu @@ -196,8 +196,8 @@ namespace helpers { for (Nd4jLong e = start; e < channels; e += step) { Nd4jLong posX[] = {b, inY, inX, e}; Nd4jLong posZ[] = {b, y, x, e}; - auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 4); - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), posZ, 4); + auto xIndex = shape::getOffset(inputShape, posX); + auto zIndex = shape::getOffset(outputShape, posZ); output[zIndex] = input[xIndex]; } } @@ -284,10 +284,10 @@ namespace helpers { Nd4jLong y1Pos[] = {b, 0}; Nd4jLong y2Pos[] = {b, 2}; Nd4jLong x2Pos[] = {b, 3}; - Z y1 = 
boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), y1Pos, 2)];//->t(b, 0)]; - Z x1 = boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), x1Pos, 2)]; - Z y2 = boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), y2Pos, 2)]; - Z x2 = boxes[shape::getOffset(0, shape::shapeOf(boxesShape), shape::stride(boxesShape), x2Pos, 2)]; + Z y1 = boxes[shape::getOffset(boxesShape, y1Pos)];//->t(b, 0)]; + Z x1 = boxes[shape::getOffset(boxesShape, x1Pos)]; + Z y2 = boxes[shape::getOffset(boxesShape, y2Pos)]; + Z x2 = boxes[shape::getOffset(boxesShape, x2Pos)]; int bIn = indices[b]; if (bIn >= batchSize) { @@ -308,7 +308,7 @@ namespace helpers { auto step = blockDim.z * gridDim.z; for (int d = start; d < depth; d += step) { Nd4jLong zPos[] = {b, y, x, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); + auto zIndex = shape::getOffset(outputShape, zPos); output[zIndex] = (Z)extrapolationVal; //crops->p(b, y, x, d, extrapolationVal); } @@ -329,7 +329,7 @@ namespace helpers { auto step = blockDim.z * gridDim.z; for (int d = start; d < depth; d += step) { Nd4jLong zPos[] = {b, y, x, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); + auto zIndex = shape::getOffset(outputShape, zPos); output[zIndex] = (Z)extrapolationVal; // crops->p(b, y, x, d, extrapolationVal); } @@ -346,14 +346,14 @@ namespace helpers { Nd4jLong topRightPos[] = {bIn, topYIndex, right_x_index, d}; Nd4jLong bottomLeftPos[] = {bIn, bottomYIndex, left_x_index, d}; Nd4jLong bottomRightPos[] = {bIn, bottomYIndex, right_x_index, d}; - const T topLeft(images[shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), topLeftPos, 4)]); //->e(bIn, topYIndex, left_x_index, d)); - const T topRight(images[shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), topRightPos, 4)]); //->e(bIn, topYIndex, right_x_index, d)); - const T bottomLeft(images[shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), bottomLeftPos, 4)]);//->e(bIn, bottomYIndex, left_x_index, d)); - const T bottomRight(images[shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), bottomRightPos, 4)]); //->e(bIn, bottomYIndex, right_x_index, d)); + const T topLeft(images[shape::getOffset(imagesShape, topLeftPos)]); //->e(bIn, topYIndex, left_x_index, d)); + const T topRight(images[shape::getOffset(imagesShape, topRightPos)]); //->e(bIn, topYIndex, right_x_index, d)); + const T bottomLeft(images[shape::getOffset(imagesShape, bottomLeftPos)]);//->e(bIn, bottomYIndex, left_x_index, d)); + const T bottomRight(images[shape::getOffset(imagesShape, bottomRightPos)]); //->e(bIn, bottomYIndex, right_x_index, d)); const T top = topLeft + (topRight - topLeft) * x_lerp; const T bottom = bottomLeft + (bottomRight - bottomLeft) * x_lerp; Nd4jLong zPos[] = {b, y, x, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); + auto zIndex = shape::getOffset(outputShape, zPos); output[zIndex] = Z(top + (bottom - top) * y_lerp); // crops->p(b, y, x, d, top + (bottom - top) * y_lerp); } @@ -368,7 +368,7 @@ namespace helpers { auto step = blockDim.z * gridDim.z; for (int d = start; d < depth; d += step) { Nd4jLong zPos[] = {b, y, x, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); + auto zIndex = shape::getOffset(outputShape, 
zPos); output[zIndex] = (Z)extrapolationVal; } continue; @@ -380,8 +380,8 @@ namespace helpers { for (int d = start; d < depth; d += step) { Nd4jLong zPos[] = {b, y, x, d}; Nd4jLong xPos[] = {bIn, closestYIndex, closestXIndex, d}; - auto zIndex = shape::getOffset(0, shape::shapeOf(outputShape), shape::stride(outputShape), zPos, 4); - auto xIndex = shape::getOffset(0, shape::shapeOf(imagesShape), shape::stride(imagesShape), xPos, 4); + auto zIndex = shape::getOffset(outputShape, zPos); + auto xIndex = shape::getOffset(imagesShape, xPos); output[zIndex] = images[xIndex]; // crops->p(b, y, x, d, images->e(bIn, closestYIndex, closestXIndex, d)); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu index d96c1efa2..d221ae023 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/image_suppression.cu @@ -37,16 +37,15 @@ namespace helpers { Nd4jLong next1[] = {nextIndex, 1}; Nd4jLong next2[] = {nextIndex, 2}; Nd4jLong next3[] = {nextIndex, 3}; - Nd4jLong* shapeOf = shape::shapeOf(boxesShape); - Nd4jLong* strideOf = shape::stride(boxesShape); - T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, previous0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous2, 2)]); - T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, previous1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous3, 2)]); - T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, previous0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous2, 2)]); - T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, previous1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, previous3, 2)]); - T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, next0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next2, 2)]); - T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(0, shapeOf, strideOf, next1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next3, 2)]); - T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, next0, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next2, 2)]); - T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(0, shapeOf, strideOf, next1, 2)], boxes[shape::getOffset(0, shapeOf, strideOf, next3, 2)]); + + T minYPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T minXPrev = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T maxYPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous0)], boxes[shape::getOffset(boxesShape, previous2)]); + T maxXPrev = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, previous1)], boxes[shape::getOffset(boxesShape, previous3)]); + T minYNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T minXNext = nd4j::math::nd4j_min(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); + T maxYNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next0)], boxes[shape::getOffset(boxesShape, next2)]); + T maxXNext = nd4j::math::nd4j_max(boxes[shape::getOffset(boxesShape, next1)], boxes[shape::getOffset(boxesShape, next3)]); T areaPrev = (maxYPrev - minYPrev) * (maxXPrev - minXPrev); T 
areaNext = (maxYNext - minYNext) * (maxXNext - minXNext); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu index f0d1df1cc..ec4fd2a97 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/lup.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/lup.cu @@ -47,10 +47,10 @@ namespace helpers { Nd4jLong pos[] = {i, i - 1}; Nd4jLong posX[] = {i, i}; Nd4jLong posY[] = {i - 1, i - 1}; - auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); - auto dxIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2); - auto dyIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posY, 2); - auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); + auto xIndex = shape::getOffset(inputShape, pos); + auto dxIndex = shape::getOffset(inputShape, posX); + auto dyIndex = shape::getOffset(inputShape, posY); + auto zIndex = shape::getOffset(invertedShape, pos); // invert lower triangular matrix inverted[zIndex] = -input[xIndex] / (input[dxIndex] * input[dyIndex]); // math::atomics::nd4j_atomicAdd(&inverted[zIndex], - input[xIndex] * inverted[iIndex] / input[dIndex]); @@ -69,8 +69,8 @@ namespace helpers { for (int i = start; i < n; i += step) { Nd4jLong pos[] = {i, i}; - auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), pos, 2); - auto zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), pos, 2); + auto xIndex = shape::getOffset(inputShape, pos); + auto zIndex = shape::getOffset(invertedShape, pos); // math::atomics::nd4j_atomicDiv(&inverted[zIndex], input[xIndex]); // invert diagonal elements inverted[zIndex] /= input[xIndex]; @@ -85,18 +85,9 @@ namespace helpers { __shared__ T* inverted; __shared__ T* input; - __shared__ Nd4jLong* inputStride; - __shared__ Nd4jLong* invertedStride; - __shared__ Nd4jLong* invertedShapeOf; - __shared__ Nd4jLong* inputShapeOf; if (threadIdx.x == 0) { inverted = reinterpret_cast(invertedBuf); input = reinterpret_cast(inputBuf); - inputStride = shape::stride(inputShape); - invertedStride = shape::stride(invertedShape); - invertedShapeOf = shape::shapeOf(invertedShape); - inputShapeOf = shape::shapeOf(inputShape); - } __syncthreads(); @@ -106,9 +97,9 @@ namespace helpers { for (int i = start; i < n - 1; i += step) { Nd4jLong pos[] = {i, i + 1}; Nd4jLong posX[] = {i + 1, i + 1}; - auto xIndex = shape::getOffset(0, inputShapeOf, shape::stride(inputShape), pos, 2); - auto iIndex = shape::getOffset(0, invertedShapeOf, invertedStride, posX, 2); - auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStride, pos, 2); + auto xIndex = shape::getOffset(inputShape, pos); + auto iIndex = shape::getOffset(invertedShape, posX); + auto zIndex = shape::getOffset(invertedShape, pos); // invert upper matrix math::atomics::nd4j_atomicAdd(&inverted[zIndex], -input[xIndex] * inverted[iIndex]); // / input[yIndex]); //inputMatrix->t(i, i + 1) * invertedMatrix->t(i + 1, i + 1) / inputMatrix->t(i, i) @@ -130,12 +121,10 @@ namespace helpers { Nd4jLong posX[] = {i, k}; Nd4jLong posD[] = {i, i}; - auto xIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posX, 2); - auto yIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posY, - 2); - auto dIndex = shape::getOffset(0, shape::shapeOf(inputShape), shape::stride(inputShape), posD, 2); - auto 
zIndex = shape::getOffset(0, shape::shapeOf(invertedShape), shape::stride(invertedShape), posZ, - 2); + auto xIndex = shape::getOffset(inputShape, posX); + auto yIndex = shape::getOffset(invertedShape, posY); + auto dIndex = shape::getOffset(inputShape, posD); + auto zIndex = shape::getOffset(invertedShape, posZ); // invert non-diagonal elements math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex] / input[dIndex]); } @@ -149,18 +138,10 @@ namespace helpers { invertUpKernel(void *invertedBuf, Nd4jLong *invertedShape, void *inputBuf, Nd4jLong *inputShape, Nd4jLong n) { __shared__ T* inverted; __shared__ T* input; - __shared__ Nd4jLong* inputShapeOf; - __shared__ Nd4jLong* invertedShapeOf; - __shared__ Nd4jLong* invertedStrideOf; - __shared__ Nd4jLong* inputStrideOf; if (threadIdx.x == 0) { inverted = reinterpret_cast<T*>(invertedBuf); input = reinterpret_cast<T*>(inputBuf); - inputShapeOf = shape::shapeOf(inputShape); - invertedShapeOf = shape::shapeOf(invertedShape); - inputStrideOf = shape::stride(inputShape); - invertedStrideOf = shape::stride(invertedShape); } __syncthreads(); @@ -171,9 +152,9 @@ namespace helpers { Nd4jLong posY[] = {k, j}; Nd4jLong posX[] = {i, k}; // inversion with Gauss-Jordan transformation - auto xIndex = shape::getOffset(0, inputShapeOf, inputStrideOf, posX, 2); - auto yIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posY, 2); - auto zIndex = shape::getOffset(0, invertedShapeOf, invertedStrideOf, posZ, 2); + auto xIndex = shape::getOffset(inputShape, posX); + auto yIndex = shape::getOffset(invertedShape, posY); + auto zIndex = shape::getOffset(invertedShape, posZ); // invert upper non-diagonal elements math::atomics::nd4j_atomicAdd(&inverted[zIndex], -inverted[yIndex] * input[xIndex]); } @@ -289,7 +270,7 @@ namespace helpers { auto step = blockDim.x * gridDim.x; for (int k = pos + start, j = start; j < n2; k += step, j += step) { - auto xIndex = shape::getIndexOffset(k, inputShape, inputLen); + auto xIndex = shape::getIndexOffset(k, inputShape); matrix[j] = (F) inputBuf[xIndex]; } } @@ -315,7 +296,7 @@ namespace helpers { auto step = blockDim.x * gridDim.x; for (int k = pos + start, j = start; j < n2; k += step, j += step) { - auto zIndex = shape::getIndexOffset(k, outputShape, outputLen); + auto zIndex = shape::getIndexOffset(k, outputShape); outputBuf[zIndex] = matrix[j]; } } @@ -331,7 +312,7 @@ namespace helpers { for (auto i = start; i < rowNum; i += step) { int val = source[i] - 1; Nd4jLong posF[] = {i, val}; - auto pos = shape::getOffset(0, shape::shapeOf(shape), shape::stride(shape), posF, 2); + auto pos = shape::getOffset(shape, posF); permutation[pos] = F(1.f); } } @@ -522,7 +503,7 @@ namespace helpers { lup_(context, &matrix, nullptr, nullptr); // else // lup_(context, &matrix, nullptr, nullptr); - auto offset = shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf()); + auto offset = shape::getIndexOffset(e, output->shapeInfo()); auto inputBuf = reinterpret_cast(matrix.specialBuffer()); auto outputBuf = reinterpret_cast(output->specialBuffer()) + offset; // if (matrix.dataType() == input->dataType()) @@ -570,7 +551,7 @@ namespace helpers { lup_(context, &matrix, nullptr, nullptr); // else // lup_(context, &matrix, nullptr, nullptr); - auto offset = shape::getIndexOffset(e, output->shapeInfo(), output->lengthOf()); + auto offset = shape::getIndexOffset(e, output->shapeInfo()); auto inputBuf = reinterpret_cast(matrix.specialBuffer()); auto outputBuf = reinterpret_cast(output->specialBuffer()) + offset; // if 
(matrix.dataType() == input->dataType()) @@ -596,34 +577,11 @@ namespace helpers { fillLowerUpperKernel(void *lowerBuf, Nd4jLong *lowerShape, void *upperBuf, Nd4jLong *upperShape, void *matrixBuf, Nd4jLong *matrixShape, Nd4jLong n) { - __shared__ - Nd4jLong *xShapeOf; - __shared__ - Nd4jLong *yShapeOf; - __shared__ - Nd4jLong *zShapeOf; - __shared__ - Nd4jLong *xStrideOf; - __shared__ - Nd4jLong *yStrideOf; - __shared__ - Nd4jLong *zStrideOf; - __shared__ - T *lowerMatrix; - __shared__ - T *upperMatrix; - __shared__ - T *matrix; + __shared__ T *lowerMatrix; + __shared__ T *upperMatrix; + __shared__ T *matrix; if (threadIdx.x == 0) { - xShapeOf = shape::shapeOf(lowerShape); - xStrideOf = shape::stride(lowerShape); - - yShapeOf = shape::shapeOf(upperShape); - yStrideOf = shape::stride(upperShape); - - zShapeOf = shape::shapeOf(matrixShape); - zStrideOf = shape::stride(matrixShape); lowerMatrix = reinterpret_cast(lowerBuf); upperMatrix = reinterpret_cast(upperBuf); matrix = reinterpret_cast(matrixBuf); @@ -634,10 +592,10 @@ namespace helpers { for (int j = threadIdx.x; j < n; j += blockDim.x) { Nd4jLong posX[] = {k, j}; Nd4jLong posD[] = {j, j}; - auto xPos = shape::getOffset(0, xShapeOf, xStrideOf, posX, 2); - auto yPos = shape::getOffset(0, yShapeOf, yStrideOf, posX, 2); - auto iPos = shape::getOffset(0, zShapeOf, zStrideOf, posX, 2); - auto dPos = shape::getOffset(0, zShapeOf, zStrideOf, posD, 2); + auto xPos = shape::getOffset(lowerShape, posX); + auto yPos = shape::getOffset(upperShape, posX); + auto iPos = shape::getOffset(matrixShape, posX); + auto dPos = shape::getOffset(matrixShape, posD); if (k >= j) lowerMatrix[xPos] = matrix[iPos];//(k, j); else @@ -850,18 +808,14 @@ namespace helpers { T *output = outputBuf; T *input = inputBuf; - Nd4jLong *shapeOf = shape::shapeOf(tadShape); - Nd4jLong *strideOf = shape::stride(tadShape); - for (auto i = blockIdx.x; i < batchNum; i += gridDim.x) { T *current = input + tadOffsets[i]; - auto zIndex = shape::getIndexOffset(i, outputShape, batchNum); + auto zIndex = shape::getIndexOffset(i, outputShape); for (auto e = threadIdx.x; e < n; e += blockDim.x) { Nd4jLong diag[] = {e, e}; - auto xIndex = shape::getOffset(0, shapeOf, strideOf, diag, 2); - math::atomics::nd4j_atomicAdd(&output[zIndex], - math::nd4j_log(current[xIndex] * current[xIndex])); + auto xIndex = shape::getOffset(tadShape, diag); + math::atomics::nd4j_atomicAdd(&output[zIndex],math::nd4j_log(current[xIndex] * current[xIndex])); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu index 01baaffb4..a3c754cf5 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrixSetDiag.cu @@ -61,14 +61,14 @@ __global__ static void matrixSetDiagCuda(const void* vx, const Nd4jLong* xShapeI for (Nd4jLong i = tid; i < xLen; i += gridDim.x * blockDim.x) { - shape::index2coords(xRank, xShapeInfo + 1, i, xLen, coords); + shape::index2coords(i, xShapeInfo, coords); - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank); - const auto zOffset = areSameOffsets ? xOffset : shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); + const auto zOffset = areSameOffsets ? 
xOffset : shape::getOffset(zShapeInfo, coords); // condition to be on diagonal of innermost matrix if(coords[xRank - 2] == coords[xRank - 1]) - z[zOffset] = y[shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank, coords, xRank - 1)]; + z[zOffset] = y[shape::getOffset(yShapeInfo, coords)]; else z[zOffset] = zeroPad ? static_cast(0) : x[xOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu index 41b71f5d7..e72ab1f5c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_band.cu @@ -40,11 +40,9 @@ namespace helpers { for (Nd4jLong i = blockIdx.y; i < rows; i += gridDim.y) { for (Nd4jLong j = threadIdx.x; j < cols; j += totalThreads) { Nd4jLong coords[2] = {i, j}; - Nd4jLong tadOffsetOut = shape::getOffset(0, shape::shapeOf(tadOnlyOutputShapeInfo), - shape::stride(tadOnlyOutputShapeInfo), coords, 2); - Nd4jLong tadOffsetIn = shape::getOffset(0, shape::shapeOf(tadOnlyInputShapeInfo), - shape::stride(tadOnlyInputShapeInfo), coords, 2); - //shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength) + Nd4jLong tadOffsetOut = shape::getOffset(tadOnlyOutputShapeInfo, coords); + Nd4jLong tadOffsetIn = shape::getOffset(tadOnlyInputShapeInfo, coords); + //shape::getIndexOffset(j, tadOnlyOutputShapeInfo) if (i >= j) { // check lower diagonals if (lowerBand > 0) { if ((i - j) > lowerBand) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu index a83067f01..ea428acb2 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/matrix_diag_part.cu @@ -41,9 +41,9 @@ namespace helpers { auto xOffset = tadOutputOffsets[i]; for (Nd4jLong j = threadIdx.x; j < inputLength; j += totalThreads) { Nd4jLong coords[2] = {j, j}; - Nd4jLong tadOffset = shape::getOffset(0, shape::shapeOf(tadOnlyInputShapeInfo), shape::stride(tadOnlyInputShapeInfo), coords, 2); - //shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength) - *(reinterpret_cast(outputBuffer) + xOffset + shape::getIndexOffset(j, tadOnlyOutputShapeInfo, inputLength)) = *(reinterpret_cast(inputBuffer) + yOffset + tadOffset); + Nd4jLong tadOffset = shape::getOffset(tadOnlyInputShapeInfo, coords); + //shape::getIndexOffset(j, tadOnlyOutputShapeInfo) + *(reinterpret_cast(outputBuffer) + xOffset + shape::getIndexOffset(j, tadOnlyOutputShapeInfo)) = *(reinterpret_cast(inputBuffer) + yOffset + tadOffset); } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu b/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu index d5af6328a..aa129ee8e 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/max_pooling.cu @@ -27,12 +27,12 @@ namespace ops { namespace helpers { template - static _CUDA_G void indicesFiller(void *vz, Nd4jLong *zShapeInfo, Nd4jLong zLength, Nd4jLong part, Nd4jLong bSize) { + static _CUDA_G void indicesFiller(void *vz, Nd4jLong *zShapeInfo, Nd4jLong part, Nd4jLong bSize) { auto z = reinterpret_cast(vz); for (int b = blockIdx.x; b < bSize; b += gridDim.x) { for (Nd4jLong e = threadIdx.x; e < part; e += blockDim.x) { - z[shape::getIndexOffset(e + b * part, zShapeInfo, zLength)] = static_cast(e); + z[shape::getIndexOffset(e + b * part, zShapeInfo)] = static_cast(e); } } } @@ -74,7 +74,7 @@ namespace helpers { auto total = 
input->lengthOf(); auto part = total / bSize; - indicesFiller<<<256, 256, 1024, *block.launchContext()->getCudaStream()>>>(indices->specialBuffer(), indices->specialShapeInfo(), indices->lengthOf(), part, bSize); + indicesFiller<<<256, 256, 1024, *block.launchContext()->getCudaStream()>>>(indices->specialBuffer(), indices->specialShapeInfo(), part, bSize); /* for (int k = 0; k < total; ) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/merge.cu b/libnd4j/include/ops/declarable/helpers/cuda/merge.cu index 27c8fc630..14fda24ec 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/merge.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/merge.cu @@ -47,7 +47,7 @@ namespace nd4j { for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); - auto val = x[shape::getIndexOffset(e, xShape, length)];; + auto val = x[shape::getIndexOffset(e, xShape)]; if (mVal < val) { mIdx = static_cast(i); mVal = val; @@ -55,7 +55,7 @@ } __syncthreads(); - output[shape::getIndexOffset(e, outputShape, length)] = mIdx; + output[shape::getIndexOffset(e, outputShape)] = mIdx; } } @@ -105,13 +105,13 @@ namespace nd4j { for (int i = 0; i < numArrays; i++) { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); - auto val = x[shape::getIndexOffset(e, xShape, length)];; + auto val = x[shape::getIndexOffset(e, xShape)]; if (mVal < val) mVal = val; } __syncthreads(); - output[shape::getIndexOffset(e, outputShape, length)] = mVal; + output[shape::getIndexOffset(e, outputShape)] = mVal; } } @@ -160,10 +160,10 @@ namespace nd4j { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); - sum += x[shape::getIndexOffset(e, xShape, length)]; + sum += x[shape::getIndexOffset(e, xShape)]; } - output[shape::getIndexOffset(e, outputShape, length)] = sum / numArrays; + output[shape::getIndexOffset(e, outputShape)] = sum / numArrays; } } @@ -213,10 +213,10 @@ namespace nd4j { auto x = reinterpret_cast(inArrs[i]); auto xShape = reinterpret_cast(inShapes[i]); - sum += x[shape::getIndexOffset(e, xShape, length)]; + sum += x[shape::getIndexOffset(e, xShape)]; } - output[shape::getIndexOffset(e, outputShape, length)] = sum; + output[shape::getIndexOffset(e, outputShape)] = sum; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu b/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu index ea4a1e146..399447c9a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/meshgrid.cu @@ -55,8 +55,8 @@ namespace helpers { } } else { for (int i = threadIdx.x; i < length; i += blockDim.x) { - auto xOffset = shape::getIndexOffset(i, xShapeInfo, length); - auto zOffset = shape::getIndexOffset(i, zShapeInfo, length); + auto xOffset = shape::getIndexOffset(i, xShapeInfo); + auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = x[xOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu b/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu index 3b80f3df9..50a5a4025 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/nth_element.cu @@ -31,22 +31,21 @@ namespace helpers { template <typename T> static __global__ void fillUpElementKernel(void* outputBuffer, Nd4jLong* outputShapeInfo, void* inputBuffer, Nd4jLong* inputShapeInfo, Nd4jLong* pTadShape, Nd4jLong* pTadOffsets, Nd4jLong n) { - __shared__ Nd4jLong bufferLength, arrLen; 
+ __shared__ Nd4jLong bufferLength; auto z = reinterpret_cast(outputBuffer); auto x = reinterpret_cast(inputBuffer); - if (threadIdx.x == 0) { - arrLen = shape::length(pTadShape); + if (threadIdx.x == 0) bufferLength = shape::length(outputShapeInfo); - } + __syncthreads(); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; for (int t = tid; t < bufferLength; t += step) { auto tX = x + pTadOffsets[t]; - z[shape::getIndexOffset(t, outputShapeInfo, bufferLength)] = tX[shape::getIndexOffset(n, pTadShape, arrLen)]; //tX]; + z[shape::getIndexOffset(t, outputShapeInfo)] = tX[shape::getIndexOffset(n, pTadShape)]; //tX]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu index 53b983d09..c0d1d95dc 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/one_hot.cu @@ -61,14 +61,14 @@ __global__ static void onehotCuda(const void *vx, const Nd4jLong *xShapeInfo, vo for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(zRank, shape::shapeOf(const_cast(zShapeInfo)), i, zLen, coord); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast(zShapeInfo)), shape::stride(const_cast(zShapeInfo)), coord, zRank); + shape::index2coords(i, zShapeInfo, coord); + const auto zOffset = shape::getOffset(zShapeInfo, coord); const auto depthCoord = coord[axis]; for (uint j = axis; j < zRank - 1; ++j) coord[j] = coord[j + 1]; - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast(xShapeInfo)), shape::stride(const_cast(xShapeInfo)), coord, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coord); const Nd4jLong idx = x[xOffset]; z[zOffset] = depthCoord == idx ? on : off; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu index e19ddcb1b..aede6243a 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/pad.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/pad.cu @@ -48,7 +48,7 @@ namespace nd4j { auto z = reinterpret_cast(vz); __shared__ int rank, rankMinusOne; - __shared__ Nd4jLong zLen, yLen, totalThreads, *coords, *xShape, *zShape, *xStride, *zStride, shift1, shift2, yStride0; + __shared__ Nd4jLong zLen, totalThreads, *coords, *xShape, *zShape, shift1, shift2, yStride0; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; @@ -56,12 +56,9 @@ namespace nd4j { zLen = shape::length(zShapeInfo); xShape = shape::shapeOf(const_cast(xShapeInfo)); zShape = shape::shapeOf(const_cast(zShapeInfo)); - xStride = shape::stride(const_cast(xShapeInfo)); - zStride = shape::stride(const_cast(zShapeInfo)); yStride0 = shape::stride(const_cast(yShapeInfo))[0]; rank = shape::rank(xShapeInfo); zLen = shape::length(zShapeInfo); - yLen = 2 * rank; rankMinusOne = rank - 1; totalThreads = gridDim.x * blockDim.x; shift1 = mode == 1 ? 
0 : 1; // REFLECT : SYMMETRIC @@ -78,19 +75,19 @@ namespace nd4j { for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(rank, zShape, i, zLen, xzCoord); - const auto zOffset = shape::getOffset(0, zShape, zStride, xzCoord, rank); + shape::index2coords(i, zShapeInfo, xzCoord); + const auto zOffset = shape::getOffset(zShapeInfo, xzCoord); bool within = true; for(int j = rankMinusOne; j >= 0; --j) { if(xShape[j] == zShape[j]) continue; - const auto left = y[shape::getIndexOffset(yStride0 * j, yShapeInfo, yLen)]; + const auto left = y[shape::getIndexOffset(yStride0 * j, yShapeInfo)]; if(xzCoord[j] < left || xzCoord[j] >= left + xShape[j]) {within = false; break;} else {xzCoord[j] = xzCoord[j] - left;} } if(within) - z[zOffset] = x[shape::getOffset(0, xShape, xStride, xzCoord, rank)]; + z[zOffset] = x[shape::getOffset(xShapeInfo, xzCoord)]; else z[zOffset] = padVal; } @@ -99,18 +96,18 @@ namespace nd4j { for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - shape::index2coords(rank, zShape, i, zLen, xzCoord); - const auto zOffset = shape::getOffset(0, zShape, zStride, xzCoord, rank); + shape::index2coords(i, zShapeInfo, xzCoord); + const auto zOffset = shape::getOffset(zShapeInfo, xzCoord); for(int j = rankMinusOne; j >= 0; --j) { if(xShape[j] == zShape[j]) continue; - xzCoord[j] = xzCoord[j] - y[shape::getIndexOffset(yStride0 * j, yShapeInfo, yLen)]; // are ready to fill middle (within input dimension range) + xzCoord[j] = xzCoord[j] - y[shape::getIndexOffset(yStride0 * j, yShapeInfo)]; // are ready to fill middle (within input dimension range) if(xzCoord[j] < 0) xzCoord[j] = -xzCoord[j] - shift1; // means fill from left else if(xzCoord[j] >= xShape[j]) xzCoord[j] = 2 * xShape[j] - xzCoord[j] - shift2; // means fill from right } - const auto xOffset = shape::getOffset(0, xShape, xStride, xzCoord, rank); + const auto xOffset = shape::getOffset(xShapeInfo, xzCoord); z[zOffset] = x[xOffset]; } } @@ -164,14 +161,14 @@ namespace nd4j { auto step = blockDim.x * gridDim.x; for(int i = start; i < zLen; i+= step) { - auto zIndex = shape::getIndexOffset(i, zShape, zLen); - auto xIndex = shape::getIndexOffset(len - i, xShape, xLen); + auto zIndex = shape::getIndexOffset(i, zShape); + auto xIndex = shape::getIndexOffset(len - i, xShape); if (i < leftSide) // left side - xIndex = shape::getIndexOffset(leftSideCorrected - i, xShape, xLen); + xIndex = shape::getIndexOffset(leftSideCorrected - i, xShape); else if(i >= leftSide && i < leftSide + xLen) // middle - xIndex = shape::getIndexOffset(i - leftSide, xShape, xLen); + xIndex = shape::getIndexOffset(i - leftSide, xShape); // else // right side // z[i] = x[len - i]; @@ -187,8 +184,6 @@ namespace nd4j { __shared__ I const* pads; __shared__ F* z; __shared__ Nd4jLong zRank, rank; - __shared__ Nd4jLong* xShapeOf, *xStrideOf, *padsShapeOf, *padsStrideOf; - __shared__ Nd4jLong* zShapeOf, *zStrideOf; __shared__ Nd4jLong* xIdx; if (threadIdx.x == 0) { extern __shared__ unsigned char shmem[]; @@ -198,13 +193,6 @@ namespace nd4j { x = reinterpret_cast(vx);// pads = reinterpret_cast(paddings); z = reinterpret_cast(vz); - xShapeOf = shape::shapeOf(xShape); - xStrideOf = shape::stride(xShape); - zShapeOf = shape::shapeOf(zShape); - zRank = shape::rank(zShape); - zStrideOf = shape::stride(zShape); - padsShapeOf = shape::shapeOf(paddingShape); - padsStrideOf = shape::stride(paddingShape); } __syncthreads(); auto start = threadIdx.x + blockIdx.x * blockDim.x; @@ -214,14 +202,14 @@ namespace nd4j { auto xzCoord = xIdx + threadIdx.x * rank; //auto 
zxCoord = xIdx + (threadIdx.x + threadIdx.x % 2 + 1) * rank; - shape::index2coords(rank, zShapeOf, i, xzCoord); - auto outOffset = shape::getOffset(0, zShapeOf, zStrideOf, xzCoord, rank); + shape::index2coords(i, zShape, xzCoord); + auto outOffset = shape::getOffset(zShape, xzCoord); // auto intStep = blockDim.y * gridDim.y; for(int j = 0; j < rank; j++) { const Nd4jLong inLen = shape::sizeAt(xShape, j); Nd4jLong coords[2] = {j, 0}; - auto padOffset = shape::getOffset(0, padsShapeOf, padsStrideOf, coords, 2); // padding already has rank 2 + auto padOffset = shape::getOffset(paddingShape, coords); // padding already has rank 2 const auto leftSide = pads[padOffset]; const auto leftSideCorrected = leftSide - reflBorder; const Nd4jLong len = 2 * (inLen - 1) + leftSide + reflBorder; @@ -238,7 +226,7 @@ namespace nd4j { xzCoord[j] = xzCoord[j] - len; } - auto inOffset = shape::getOffset(0, xShapeOf, xStrideOf, xzCoord, rank); + auto inOffset = shape::getOffset(xShape, xzCoord); z[outOffset] = x[inOffset]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu index 7b325eb3e..ccfbbf943 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/percentile.cu @@ -43,8 +43,8 @@ namespace helpers { for (int tid = threadIdx.x; tid < tadLength; tid += blockDim.x) { auto top = 2 * tid + 1; if (top < tadLength) { - auto t0 = shape::getIndexOffset(top - 1, xTadShapeInfo, tadLength); - auto t1 = shape::getIndexOffset(top, xTadShapeInfo, tadLength); + auto t0 = shape::getIndexOffset(top - 1, xTadShapeInfo); + auto t1 = shape::getIndexOffset(top, xTadShapeInfo); if (x[t0] > x[t1]) { //swap values @@ -58,8 +58,8 @@ namespace helpers { for (int tid = threadIdx.x; tid < tadLength; tid += blockDim.x) { auto top = 2 * tid + 2; if (top < tadLength) { - auto t0 = shape::getIndexOffset(top - 1, xTadShapeInfo, tadLength); - auto t1 = shape::getIndexOffset(top, xTadShapeInfo, tadLength); + auto t0 = shape::getIndexOffset(top - 1, xTadShapeInfo); + auto t1 = shape::getIndexOffset(top, xTadShapeInfo); if (x[t0] > x[t1]) { //swap values @@ -76,7 +76,7 @@ namespace helpers { // saving final value if (threadIdx.x == 0) - z[shape::getIndexOffset(t, zShapeInfo, zLength)] = x[shape::getIndexOffset(position, xTadShapeInfo, tadLength)]; + z[shape::getIndexOffset(t, zShapeInfo)] = x[shape::getIndexOffset(position, xTadShapeInfo)]; __syncthreads(); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu index bddaf65e3..01b9464fa 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/polyGamma.cu @@ -34,31 +34,31 @@ __global__ static void polyGammaCuda(const void *vn, const Nd4jLong *nShapeInfo, const auto n = reinterpret_cast(vn); const auto x = reinterpret_cast(vx); - auto z = reinterpret_cast(vz); + auto z = reinterpret_cast(vz); __shared__ Nd4jLong len; - - if (threadIdx.x == 0) - len = shape::length(nShapeInfo); + + if (threadIdx.x == 0) + len = shape::length(nShapeInfo); __syncthreads(); const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto totalThreads = gridDim.x * blockDim.x; for (int i = tid; i < len; i += totalThreads) { - - const auto nOffset = shape::getIndexOffset(i, nShapeInfo, len); - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - const auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); - const T 
nVal = n[nOffset]; - + const auto nOffset = shape::getIndexOffset(i, nShapeInfo); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + const auto zOffset = shape::getIndexOffset(i, zShapeInfo); + + const T nVal = n[nOffset]; + int sign = (static_cast(nVal) + 1) % 2 ? -1 : 1; T factorial = 1; if(nVal != 0 && nVal != 1) for(int i = 2; i <= nVal; ++i) - factorial *= i; + factorial *= i; z[zOffset] = sign * factorial * zetaScalar(nVal + 1, x[xOffset]); } @@ -75,10 +75,10 @@ static void polyGammaCudaLauncher(const int blocksPerGrid, const int threadsPerB void polyGamma(nd4j::LaunchContext * context, const NDArray& n, const NDArray& x, NDArray& z) { NDArray::prepareSpecialUse({&z}, {&n, &x}); - + int threadsPerBlock = MAX_NUM_THREADS; int blocksPerGrid = (z.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; - + BUILD_SINGLE_SELECTOR(n.dataType(), polyGammaCudaLauncher, (blocksPerGrid, threadsPerBlock, context->getCudaStream(), n.getSpecialBuffer(), n.getSpecialShapeInfo(), x.getSpecialBuffer(), x.getSpecialShapeInfo(), z.getSpecialBuffer(), z.getSpecialShapeInfo()), FLOAT_TYPES); NDArray::registerSpecialUse({&z}, {&n, &x}); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu index b1412343b..52dd8b815 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/prefix.cu @@ -68,12 +68,12 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op, } if(leftArrInd < tadLen) - shared[sharedInd] = xLeft = xTad[shape::getIndexOffset(leftArrInd, xTadShapeInfo, tadLen)]; + shared[sharedInd] = xLeft = xTad[shape::getIndexOffset(leftArrInd, xTadShapeInfo)]; // else // shared[sharedInd] = (op == scalar::Add) ? 0 : 1; if(rightArrInd < tadLen) - shared[sharedInd + 1] = xRight = xTad[shape::getIndexOffset(rightArrInd, xTadShapeInfo, tadLen)]; + shared[sharedInd + 1] = xRight = xTad[shape::getIndexOffset(rightArrInd, xTadShapeInfo)]; // else // shared[sharedInd + 1] = (op == scalar::Add) ? 0 : 1; @@ -117,7 +117,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op, result = (op == scalar::Add) ? result + xLeft : result * xLeft; if(i > 0) result = (op == scalar::Add) ? result + lastElemInChunk : result * lastElemInChunk; - zTad[shape::getIndexOffset(leftArrInd, zTadShapeInfo, tadLen)] = result; + zTad[shape::getIndexOffset(leftArrInd, zTadShapeInfo)] = result; } if(rightArrInd < tadLen) { @@ -128,7 +128,7 @@ __global__ static void prefixPerBlockCuda(scalar::Ops op, result = (op == scalar::Add) ? result + lastElemInChunk : result * lastElemInChunk; if(i < numTadChunks - 1 && threadIdx.x == blockDim.x - 1) // last element in chunk lastElemInChunk = !exclusive ? result : (op == scalar::Add) ? 
result + xRight : result * xRight; - zTad[shape::getIndexOffset(rightArrInd, zTadShapeInfo, tadLen)] = result; + zTad[shape::getIndexOffset(rightArrInd, zTadShapeInfo)] = result; } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu index e86cd382a..aceebf7a0 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu @@ -34,14 +34,12 @@ namespace helpers { static __global__ void reverseArrayKernel(void* input, Nd4jLong *inputShape, void* output, Nd4jLong *outputShape, Nd4jLong numOfElemsToReverse) { const auto tid = blockIdx.x * blockDim.x + threadIdx.x; const auto step = gridDim.x * blockDim.x; - __shared__ Nd4jLong length; __shared__ int linearStatus; __shared__ T* inputArr; __shared__ T* outputArr; __shared__ char inputOrder, outputOrder; if (threadIdx.x == 0) { - length = shape::length(inputShape); linearStatus = (shape::elementWiseStride(inputShape) == shape::elementWiseStride(outputShape)) && (inputOrder == outputOrder)? shape::elementWiseStride(inputShape):0; char inputOrder = shape::order(inputShape); @@ -56,31 +54,28 @@ namespace helpers { for (Nd4jLong e = tid; e < limit; e += step) { // we're calculating offsets within input array - auto fOffset = shape::getIndexOffset(e, inputShape, length); - auto lOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, inputShape, length); + auto fOffset = shape::getIndexOffset(e, inputShape); + auto lOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, inputShape); // now we're storing input values auto v1 = inputArr[fOffset]; auto v2 = inputArr[lOffset]; // now we're calculating offsets within output array - auto zfOffset = shape::getIndexOffset(e, outputShape, length); - auto zlOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, outputShape, length); + auto zfOffset = shape::getIndexOffset(e, outputShape); + auto zlOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, outputShape); // and saving values to output arrays outputArr[zfOffset] = v2; outputArr[zlOffset] = v1; - - //printf("TID: %i; E: %lld; z[%lld], z[%lld] = x[%lld], x[%lld];\n", tid, e, zfOffset, zlOffset, lOffset, fOffset); } // in case of odd array we'll have to move middle value if (odd && tid == 0) { - auto xOffset = shape::getIndexOffset(limit, inputShape, length); - auto zOffset = shape::getIndexOffset(limit, outputShape, length); + auto xOffset = shape::getIndexOffset(limit, inputShape); + auto zOffset = shape::getIndexOffset(limit, outputShape); outputArr[zOffset] = inputArr[xOffset]; - //printf("TID: %i; E: %lld; z[%lld] = x[%lld];\n", tid, limit, zOffset, xOffset); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/roll.cu b/libnd4j/include/ops/declarable/helpers/cuda/roll.cu index 216c6b7a0..d843feeff 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/roll.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/roll.cu @@ -53,11 +53,11 @@ namespace helpers { for (int i = tid; i < actualShift; i += blockDim.x * gridDim.x) { int sourceIndex = fullLength - actualShift + i; - auto xOffsetA = shape::getIndexOffset(i, xShapeInfo, fullLength); - auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo, fullLength); + auto xOffsetA = shape::getIndexOffset(i, xShapeInfo); + auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo); - auto zOffsetA = shape::getIndexOffset(i, zShapeInfo, fullLength); - auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo, fullLength); + auto 
zOffsetA = shape::getIndexOffset(i, zShapeInfo); + auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo); auto eA = x[xOffsetA]; auto eB = x[xOffsetB]; @@ -107,11 +107,11 @@ namespace helpers { int destinationIndex = fullLength - (count + 1) * actualShift + i; int sourceIndex = fullLength - count * actualShift + i; - auto xOffsetA = shape::getIndexOffset(destinationIndex, xShapeInfo, fullLength); - auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo, fullLength); + auto xOffsetA = shape::getIndexOffset(destinationIndex, xShapeInfo); + auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo); - auto zOffsetA = shape::getIndexOffset(destinationIndex, zShapeInfo, fullLength); - auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo, fullLength); + auto zOffsetA = shape::getIndexOffset(destinationIndex, zShapeInfo); + auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo); auto eA = x[xOffsetA]; auto eB = x[xOffsetB]; @@ -154,11 +154,11 @@ namespace helpers { int remainIdx = i + actualShift; int sourceIndex = remainIdx + remainShift; - auto xOffsetA = shape::getIndexOffset(remainIdx, xShapeInfo, fullLength); - auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo, fullLength); + auto xOffsetA = shape::getIndexOffset(remainIdx, xShapeInfo); + auto xOffsetB = shape::getIndexOffset(sourceIndex, xShapeInfo); - auto zOffsetA = shape::getIndexOffset(remainIdx, zShapeInfo, fullLength); - auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo, fullLength); + auto zOffsetA = shape::getIndexOffset(remainIdx, zShapeInfo); + auto zOffsetB = shape::getIndexOffset(sourceIndex, zShapeInfo); auto eA = x[xOffsetA]; auto eB = x[xOffsetB]; @@ -190,7 +190,7 @@ namespace helpers { } } else { for (int e = threadIdx.x; e < tadLength; e += blockDim.x) { - auto zOffset = shape::getIndexOffset(e, zShapeInfo, tadLength); + auto zOffset = shape::getIndexOffset(e, zShapeInfo); auto eA = x[zOffset]; auto eB = z[zOffset]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu index 0ac0a1882..82f421fdd 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/s_t_b.cu @@ -62,14 +62,14 @@ __global__ static void batchToSpaceCuda(const void* vx, const Nd4jLong* xShapeIn if(i >= zLen) return; - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); coords[1] += cropBottom; coords[2] += cropLeft; - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; @@ -156,9 +156,9 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < zLen; i += gridDim.x * blockDim.x) { - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); // evaluate spatial coordinates for x for(uint j = 1; j <= numOfSpatialDims; ++j) { @@ -166,7 +166,7 @@ __global__ static void batchToSpaceNDCuda(const void* vx, const Nd4jLong* xShape coords[j] += y[yOffset]; // add crop 
left } - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } @@ -283,16 +283,16 @@ __global__ static void spaceToBatchCuda(const void* vx, const Nd4jLong* xShapeIn if(i >= zLen) return; - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); if(coords[1] >= padBottom && coords[1] < zShapeInfo[2] - padTop && coords[2] >= padLeft && coords[2] < zShapeInfo[3] - padRight) { coords[1] -= padBottom; coords[2] -= padLeft; - const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); z[zOffset] = x[xOffset]; } @@ -383,9 +383,9 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < zLen; i += totalThreads) { - shape::index2coords(rank, zShapeInfo + 1, i, zLen, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); bool within = true; @@ -405,7 +405,7 @@ __global__ static void spaceToBatchNDCuda(const void* vx, const Nd4jLong* xShape } if(within) - z[zOffset] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] = x[shape::getOffset(xShapeInfo, coords)]; else z[zOffset] = 0.f; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu index 54d350f47..501b9bca4 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter.cu @@ -57,8 +57,8 @@ namespace helpers { // for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { - // const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); - // const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + // const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + // const auto yOffset = shape::getIndexOffset(i, yShapeInfo); // switch (opCode) { // case pairwise::Add: @@ -99,8 +99,8 @@ namespace helpers { // __syncthreads(); // for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { - // const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); - // const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + // const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + // const auto yOffset = shape::getIndexOffset(i, yShapeInfo); // switch (opCode) { // case pairwise::Add: @@ -188,7 +188,7 @@ __global__ static void scatterLockCuda(const int opCode, for (int e = 0; e < xLen; e++) { - const Nd4jLong zIndex = x[shape::getIndexOffset(e, xShapeInfo, xLen)]; + const Nd4jLong zIndex = x[shape::getIndexOffset(e, xShapeInfo)]; const bool isOwner = zIndex < gridDim.x ? 
blockIdx.x == zIndex : blockIdx.x == zIndex % gridDim.x; if (!isOwner) @@ -199,8 +199,8 @@ __global__ static void scatterLockCuda(const int opCode, if(threadIdx.x != 0) continue; - const auto yOffset = shape::getIndexOffset(e, yTadShapeInfo, yTadLen); - const auto zOffset = shape::getIndexOffset(zIndex, zTadShapeInfo, zTadLen); + const auto yOffset = shape::getIndexOffset(e, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(zIndex, zTadShapeInfo); switch (opCode) { case pairwise::Add: @@ -241,8 +241,8 @@ __global__ static void scatterLockCuda(const int opCode, for (Nd4jLong i = threadIdx.x; i < zTadLen; i += blockDim.x) { - const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo, zTadLen); - const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo, zTadLen); + const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo); switch (opCode) { case pairwise::Add: @@ -326,19 +326,19 @@ __global__ static void scatterCuda(const int opCode, for (Nd4jLong i = tid; i < yLen; i += totalThreads) { - shape::index2coords(yRank, shape::shapeOf(const_cast<Nd4jLong*>(yShapeInfo)), i, yLen, yCoord); + shape::index2coords(i, yShapeInfo, yCoord); for (uint j = 0; j < xRank; ++j) xCoord[j] = yCoord[j]; - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), xCoord, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, xCoord); zCoord[0] = x[xOffset]; for (uint j = 0; j < yRank - xRank; ++j) zCoord[j + 1] = yCoord[xRank + j]; - const auto yOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(yShapeInfo)), shape::stride(const_cast<Nd4jLong*>(yShapeInfo)), yCoord, yRank); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), shape::stride(const_cast<Nd4jLong*>(zShapeInfo)), zCoord, zRank); + const auto yOffset = shape::getOffset(yShapeInfo, yCoord); + const auto zOffset = shape::getOffset(zShapeInfo, zCoord); switch (opCode) { case pairwise::Add: @@ -471,9 +471,9 @@ __global__ static void scatterNDLockCuda(const int opCode, const X* xTad = x + xOffsets[i]; for (uint k = 0; k < xLastDim; ++k) - zTadCoordsPerThread[k] = xTad[shape::getIndexOffset(k, xTadShapeInfo, xLastDim)]; + zTadCoordsPerThread[k] = xTad[shape::getIndexOffset(k, xTadShapeInfo)]; - const auto zTadIndex = shape::coords2index(xLastDim, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), zTadCoordsPerThread); + const auto zTadIndex = shape::coords2index(xLastDim, zShapeInfo + 1, zTadCoordsPerThread); const bool isOwner = zTadIndex < gridDim.x ? blockIdx.x == zTadIndex : blockIdx.x == zTadIndex % gridDim.x; @@ -485,8 +485,8 @@ __global__ static void scatterNDLockCuda(const int opCode, if(threadIdx.x != 0) continue; - const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo, yTadLen); - const auto zOffset = shape::getIndexOffset(zTadIndex, zTadShapeInfo, yTadLen); + const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(zTadIndex, zTadShapeInfo); switch (opCode) { case pairwise::Add: @@ -526,8 +526,8 @@ __global__ static void scatterNDLockCuda(const int opCode, for (Nd4jLong j = threadIdx.x; j < yTadLen; j += blockDim.x) { - const auto yOffset = shape::getIndexOffset(j, yTadShapeInfo, yTadLen); - const auto zOffset = shape::getIndexOffset(j, zTadShapeInfo, yTadLen); + const auto yOffset = shape::getIndexOffset(j, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(j, zTadShapeInfo); switch (opCode) { case pairwise::Add: @@ -618,22 +618,22 @@ __global__ static void scatterNDCuda(const int opCode, for (Nd4jLong i = tid; i < yLen; i += totalThreads) { - shape::index2coords(yRank, shape::shapeOf(const_cast<Nd4jLong*>(yShapeInfo)), i, yLen, yCoord); + shape::index2coords(i, yShapeInfo, yCoord); for (uint j = 0; j < xRank - 1; ++j) xCoord[j] = yCoord[j]; for (uint j = 0; j < xLastDim; ++j) { xCoord[xRank - 1] = j; - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), xCoord, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, xCoord); zCoord[j] = x[xOffset]; } for (uint j = xLastDim; j < zRank; ++j) zCoord[j] = yCoord[yRank - zRank + j]; - const auto yOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(yShapeInfo)), shape::stride(const_cast<Nd4jLong*>(yShapeInfo)), yCoord, yRank); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), shape::stride(const_cast<Nd4jLong*>(zShapeInfo)), zCoord, zRank); + const auto yOffset = shape::getOffset(yShapeInfo, yCoord); + const auto zOffset = shape::getOffset(zShapeInfo, zCoord); switch (opCode) { case pairwise::Add: @@ -760,18 +760,18 @@ __global__ void scatterForLossCuda(const void *vx, const Nd4jLong *xShapeInfo, auto coords = sharedMem + threadIdx.x * (xRank + 1); - shape::index2coords(xRank, xShapeInfo + 1, xInd, xLen, coords); + shape::index2coords(xInd, xShapeInfo, coords); // y last coordinate - coords[xRank] = x[shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, coords, xRank)]; + coords[xRank] = x[shape::getOffset(xShapeInfo, coords)]; - const auto yOffset = shape::getOffset(0, yShapeInfo + 1, yShapeInfo + xRank + 2, coords, xRank + 1); + const auto yOffset = shape::getOffset(yShapeInfo, coords); if(z == nullptr) { // gradient calculation y[yOffset] -= 1.f; } else { - z[shape::getOffset(0, zShapeInfo + 1, zShapeInfo + xRank + 1, coords, xRank)] = y[yOffset]; + z[shape::getOffset(zShapeInfo, coords)] = y[yOffset]; } }
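Note on the scatterNDLockCuda change above: shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)) and zShapeInfo + 1 are the same pointer, because the dims block of a shapeInfo vector starts one word past the rank, so the new coords2index overload can take the raw shapeInfo slice directly. A minimal illustrative sketch of the c-order fold coords2index performs over such a dims pointer (names are mine, not the library implementation):

#include <cstdint>
using Nd4jLong = int64_t;

// Fold `coords` back into a linear index over the first `rank` dimensions
// of `shape` (c-order); `shape` may be shapeInfo + 1, i.e. the dims block.
static Nd4jLong coords2indexSketch(int rank, const Nd4jLong* shape, const Nd4jLong* coords) {
    Nd4jLong index = 0;
    for (int i = 0; i < rank; ++i)
        index = index * shape[i] + coords[i];   // Horner-style accumulation
    return index;
}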
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu index f1eda6b01..37a465144 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_simple.cu @@ -40,9 +40,9 @@ namespace nd4j { auto tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < iLength; i += blockDim.x * gridDim.x) { auto x = reinterpret_cast<X*>(vx) + xTadOffsets[i]; - auto idx = indices[shape::getIndexOffset(i, iShapeInfo, iLength)]; + auto idx = indices[shape::getIndexOffset(i, iShapeInfo)]; - x[shape::getIndexOffset(idx, xTadShape, xLength)] = u[shape::getIndexOffset(i, uShapeInfo, uLength)]; + x[shape::getIndexOffset(idx, xTadShape)] = u[shape::getIndexOffset(i, uShapeInfo)]; } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu index d8b3575ff..1ad55a111 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/scatter_update.cu @@ -63,8 +63,8 @@ namespace nd4j { for (Nd4jLong i = threadIdx.x; i < arrLenX; i += blockDim.x) { - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, arrLenX); - const auto yOffset = shape::getIndexOffset(i, yShapeInfo, arrLenY); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + const auto yOffset = shape::getIndexOffset(i, yShapeInfo); switch (opCode) { case 0: diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu index 8830f37e7..cab6e50e7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_max.cu @@ -58,10 +58,10 @@ namespace nd4j { zLen = shape::length(outputShape); if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; val[segment] = z[zIndex]; } @@ -69,7 +69,7 @@ namespace nd4j { __syncthreads(); for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } } @@ -94,19 +94,19 @@ namespace nd4j { xLen = shape::length(inputShape); zLen = shape::length(outputShape); - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); //start = starts[segment]; //finish = start + lengths[segment]; if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; else z[zIndex] = -DataTypeUtils::max(); } __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment) { nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } @@ -140,16 +140,16 @@ namespace nd4j { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); //z[zIndex] = x[xIndex]; } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e,
inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[segment]) nd4j::math::atomics::nd4j_atomicMax(&z[zIndex], x[xIndex]); } @@ -276,12 +276,12 @@ namespace nd4j { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); if (nd4j::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { z[zOffset] = gradOut[gradOffsetO]; @@ -318,7 +318,7 @@ namespace nd4j { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[yIndex]; T* current = x + inputOffsets[i]; T* currentOut = z + outOffsets[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu index 19869f646..dc958f79c 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_mean.cu @@ -53,11 +53,11 @@ namespace helpers { //[zIndex] = if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; //val[segment] = ; - z[zIndex] = T(x[shape::getIndexOffset(start, inputShape, xLen)] / lengths[segment]); + z[zIndex] = T(x[shape::getIndexOffset(start, inputShape)] / lengths[segment]); // val[segment] = z[zIndex]; } @@ -65,7 +65,7 @@ namespace helpers { __syncthreads(); for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); if (lengths[segment]) nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex] / lengths[segment])); } @@ -91,11 +91,11 @@ namespace helpers { zLen = shape::length(outputShape); // if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); //start = starts[segment]; //finish = start + lengths[segment]; if (lengths[segment] > 0) - z[zIndex] = T(x[shape::getIndexOffset(starts[segment], inputShape, xLen)] / T(lengths[segment])); + z[zIndex] = T(x[shape::getIndexOffset(starts[segment], inputShape)] / T(lengths[segment])); else z[zIndex] = 0; //DataTypeUtils::max(); // val[segment] = z[zIndex]; @@ -105,8 +105,8 @@ namespace helpers { __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/T(lengths[segment]))); } 
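Every hunk in these helper files makes the same mechanical substitution: shape::getIndexOffset loses its trailing length argument, because the offset is fully determined by the index and the shapeInfo vector alone. A standalone sketch of the mapping, assuming the usual shapeInfo layout (rank word, then dims, then strides) and ignoring the ews fast path the real implementation can take; names are illustrative, not the library code:

#include <cstdint>
using Nd4jLong = int64_t;

// Map a logical element number to a buffer offset using shape and strides.
static Nd4jLong indexOffsetSketch(Nd4jLong index, const Nd4jLong* shapeInfo) {
    const Nd4jLong  rank   = shapeInfo[0];
    const Nd4jLong* shape  = shapeInfo + 1;         // dims block
    const Nd4jLong* stride = shapeInfo + 1 + rank;  // strides block

    Nd4jLong offset = 0;
    for (Nd4jLong i = rank - 1; i >= 0; --i) {      // peel c-order coordinates
        offset += (index % shape[i]) * stride[i];
        index  /= shape[i];
    }
    return offset;
}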
@@ -137,15 +137,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[segment]) nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], T(x[xIndex]/lengths[segment])); } @@ -261,11 +261,11 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); z[zOffset] = T(gradOut[gradOffsetO] / float(lengths[classIndex])); } @@ -294,14 +294,14 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { -// auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); +// auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[i]; //yIndex]; T* currentOut = z + outOffsets[i]; T* outGrad = gradOut + gradOutOffsets[segment]; for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - auto zIndex = shape::getIndexOffset(e, outTad, currentLen); - auto gradIndex = shape::getIndexOffset(e, gradOutTad, gradLen); + auto zIndex = shape::getIndexOffset(e, outTad); + auto gradIndex = shape::getIndexOffset(e, gradOutTad); if (lengths[segment] > 0) currentOut[zIndex] = T(outGrad[gradIndex] / float(lengths[segment])); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu index e5ea2eb91..506cfaa41 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_min.cu @@ -56,10 +56,10 @@ namespace helpers { zLen = shape::length(outputShape); if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; val[segment] = z[zIndex]; } @@ -67,7 +67,7 @@ namespace helpers { __syncthreads(); for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } @@ -98,9 +98,9 @@ namespace helpers { xLen = shape::length(inputShape); zLen = shape::length(outputShape); - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; else z[zIndex] = DataTypeUtils::max(); @@ -108,8 +108,8 @@ namespace helpers { __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment) { nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } @@ -140,15 +140,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); // if (lengths[indices[idx]]) nd4j::math::atomics::nd4j_atomicMin(&z[zIndex], x[xIndex]); } @@ -269,12 +269,12 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); if (nd4j::math::nd4j_abs(gradIn[gradOffsetI] - x[xOffset]) <= T(1.e-6)) { z[zOffset] = gradOut[gradOffsetO]; @@ -311,7 +311,7 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[yIndex]; T* current = x + inputOffsets[i]; T* currentOut = z + outOffsets[i];
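All of the segment kernels being touched here share one structure: one thread seeds z[segment] with the first element of the segment, the block synchronizes, and the remaining threads fold their elements in with the matching atomic (nd4j_atomicMax/Min/Add/Mul). A stripped-down CUDA sketch of that pattern for integer max, with hypothetical names and dense c-order buffers assumed (the library's nd4j::math::atomics wrappers also cover floating-point types):

// Illustrative only: one block per segment.
__global__ void segmentMaxSketch(const int* x, const int* starts, const int* lengths, int* z) {
    const int segment = blockIdx.x;
    const int start   = starts[segment];
    const int finish  = start + lengths[segment];

    if (threadIdx.x == 0)
        z[segment] = x[start];            // seed with the first element
    __syncthreads();

    for (int e = start + threadIdx.x + 1; e < finish; e += blockDim.x)
        atomicMax(&z[segment], x[e]);     // fold the rest in atomically
}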
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu index 5709a63ea..7814defe1 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_prod.cu @@ -53,11 +53,11 @@ namespace helpers { zLen = shape::length(outputShape); if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; //val[segment] = ; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; val[segment] = z[zIndex]; } @@ -67,7 +67,7 @@ namespace helpers { // auto step = blockDim.x * gridDim.x; for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); nd4j::math::atomics::nd4j_atomicMul(&val[segment], x[xIndex]); } __syncthreads(); @@ -98,11 +98,11 @@ namespace helpers { zLen = shape::length(outputShape); // if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); //start = starts[segment]; //finish = start + lengths[segment]; if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; else z[zIndex] = 0; //DataTypeUtils::max(); // val[segment] = z[zIndex]; @@ -112,8 +112,8 @@ namespace helpers { __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); } @@ -144,15 +144,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[segment] > 0) nd4j::math::atomics::nd4j_atomicMul(&z[zIndex], x[xIndex]); } @@ -268,12 +268,12 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape, gradLen); - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetI = shape::getIndexOffset(classIndex, forwardShape); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); z[zOffset] = gradOut[gradOffsetO] * gradIn[gradOffsetI] / x[xOffset]; } @@ -307,7 +307,7 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[yIndex]; T* current = x + inputOffsets[i]; T* currentOut = z + outOffsets[i]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu index 229d41cc9..f4237ac44 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sqrtn.cu @@ -51,11 +51,11 @@ namespace helpers { zLen = shape::length(outputShape); // if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); //start = starts[segment]; //finish = start + lengths[segment]; if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)] / nd4j::math::nd4j_sqrt(lengths[segment]); + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)] / nd4j::math::nd4j_sqrt(lengths[segment]); else z[zIndex] = 0; //DataTypeUtils::max(); // val[segment] = z[zIndex]; @@ -65,8 +65,8 @@ namespace helpers { __syncthreads(); if (lengths[segment] > 0) for (auto e = threadIdx.x + 1; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); } @@ -97,15 +97,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); z[zIndex] = x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment]); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex] / nd4j::math::nd4j_sqrt(lengths[segment])); } } @@ -177,11 +177,11 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); z[zOffset] = T(gradOut[gradOffsetO] / math::nd4j_sqrt(lengths[classIndex])); } @@ -211,14 +211,14 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { -// auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); +// auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[i]; //yIndex]; T* currentOut = z + outOffsets[i]; T* outGrad = gradOut + gradOutOffsets[segment]; for (auto e = threadIdx.x; e < currentLen; e += blockDim.x) { - auto zIndex = shape::getIndexOffset(e, outTad, currentLen); - auto gradIndex = shape::getIndexOffset(e, gradOutTad, gradLen); + auto zIndex = shape::getIndexOffset(e, outTad); + auto gradIndex = shape::getIndexOffset(e, gradOutTad); if (lengths[segment] > 0) currentOut[zIndex] = T(outGrad[gradIndex] / math::nd4j_sqrt(lengths[segment])); } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu index 4b8976f4e..cf4ddd942 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/segment_sum.cu @@ -58,18 +58,18 @@ namespace helpers { if (segment < numOfClasses) { - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); start = starts[segment]; finish = start + lengths[segment]; //val[segment] = ; - z[zIndex] = x[shape::getIndexOffset(start, inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(start, inputShape)]; } } __syncthreads(); for (auto e = start + threadIdx.x + 1; e < finish; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } } @@ -99,9 +99,9 @@ namespace helpers { xLen = shape::length(inputShape); zLen = shape::length(outputShape); - zIndex = shape::getIndexOffset(segment, outputShape, zLen); + zIndex = shape::getIndexOffset(segment, outputShape); if (lengths[segment] > 0) - z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape, xLen)]; + z[zIndex] = x[shape::getIndexOffset(starts[segment], inputShape)]; else z[zIndex] = 0; //DataTypeUtils::max(); } @@ -109,8 +109,8 @@ namespace helpers { if (lengths[segment] > 0) for (auto e = threadIdx.x; e < xLen; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputShape, xLen); - auto yIndex = shape::getIndexOffset(e, indicesShape, xLen); + auto xIndex = shape::getIndexOffset(e, inputShape); + auto yIndex = shape::getIndexOffset(e, indicesShape); if (y[yIndex] == segment && e != starts[segment]) { nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } @@ -141,15 +141,15 @@ namespace helpers { auto x = reinterpret_cast<T*>(inputBuf) + inputTadOffsets[idx]; if (blockIdx.x == start) { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } } else { for (auto e = threadIdx.x; e < len; e += blockDim.x) { - auto xIndex = shape::getIndexOffset(e, inputTads, len); - auto zIndex = shape::getIndexOffset(e, outputTads, len); + auto xIndex = shape::getIndexOffset(e, inputTads); + auto zIndex = shape::getIndexOffset(e, outputTads); if (lengths[indices[idx]]) nd4j::math::atomics::nd4j_atomicAdd(&z[zIndex], x[xIndex]); } @@ -269,11 +269,11 @@ namespace helpers { for (auto e = start; e < xLen; e += step) { - auto zOffset = shape::getIndexOffset(e, outputShape, xLen); - auto xOffset = shape::getIndexOffset(e, inputShape, xLen); - auto yOffset = shape::getIndexOffset(e, indicesShape, xLen); + auto zOffset = shape::getIndexOffset(e, outputShape); + auto xOffset = shape::getIndexOffset(e, inputShape); + auto yOffset = shape::getIndexOffset(e, indicesShape); auto classIndex = y[yOffset]; - auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape, gradLen); + auto gradOffsetO = shape::getIndexOffset(classIndex, epsShape); z[zOffset] = gradOut[gradOffsetO]; } @@ -302,7 +302,7 @@ namespace helpers { __syncthreads(); for (auto i = blockIdx.x; i < yLen; i += gridDim.x) { - auto yIndex = shape::getIndexOffset(i, indicesShape, yLen); + auto yIndex = shape::getIndexOffset(i, indicesShape); auto segment = y[yIndex]; T* currentOut = z + outOffsets[i]; T* outGrad = gradOut + gradOutOffsets[segment]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu index 7318dbaea..c07db1b95 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/sequence_mask.cu @@ -40,8 +40,8 @@ namespace helpers { for (auto i = blockIdx.x; i < maxIndex; i += gridDim.x) for(auto k = threadIdx.x; k < inputLen; k += blockDim.x) - if (i < input[shape::getIndexOffset(k, inputShape, inputLen)]) - output[shape::getIndexOffset(k * maxIndex + i, outputShape, outputLen)] = B(true); + if (i < input[shape::getIndexOffset(k, inputShape)]) + output[shape::getIndexOffset(k * maxIndex + i, outputShape)] = B(true); }
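For reference, the sequence_mask kernel just rewritten computes output[k, i] = (i < input[k]); the k * maxIndex + i expression is simply the c-order flattening of that 2-D coordinate. An equivalent host-side sketch with illustrative names:

#include <cstdint>
#include <vector>

// lengths[k] holds the valid length of sequence k; row k of the mask gets
// `true` in its first lengths[k] positions and `false` elsewhere.
std::vector<bool> sequenceMaskSketch(const std::vector<int64_t>& lengths, int64_t maxIndex) {
    std::vector<bool> mask(lengths.size() * maxIndex, false);
    for (size_t k = 0; k < lengths.size(); ++k)
        for (int64_t i = 0; i < maxIndex; ++i)
            if (i < lengths[k])
                mask[k * maxIndex + i] = true;   // same k * maxIndex + i flattening
    return mask;
}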
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu index 5c00244f8..5ce883a59 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/sru.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/sru.cu @@ -157,11 +157,11 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo, if(tid >= len) return; - shape::index2coords(rank, xShapeInfo + 2, tid, len, coords + 1); // loop through last two dimensions of x : {bS, 2*K} + shape::index2coords(tid, rank - 1, xShapeInfo + 2, coords + 1); // loop through last two dimensions of x : {bS, 2*K} - const auto maskOffst = mask ? shape::getOffset(0, maskShapeInfo + 1, maskShapeInfo + rank, coords + 1, rank - 1) : 0; - const auto c0Offset = shape::getOffset(0, c0ShapeInfo + 1, c0ShapeInfo + rank, coords + 1, rank - 1); - const auto bFOffset = shape::getOffset(0, bShapeInfo + 1, bShapeInfo + rank - 1, coords + 2, rank - 2); + const auto maskOffst = mask ? shape::getOffset(maskShapeInfo, coords + 1) : 0; + const auto c0Offset = shape::getOffset(c0ShapeInfo, coords + 1); + const auto bFOffset = shape::getOffset(bShapeInfo, coords + 2); const auto bROffset = bFOffset + 2 * K * bShapeInfo[2]; // 2*K*b_stride const T maskVal = mask ? mask[maskOffst] : static_cast<T>(1); @@ -176,12 +176,12 @@ __global__ static void sruBICuda(const void* vx, const Nd4jLong* xShapeInfo, else coords[0] = 0; - auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); - auto htOffset = shape::getOffset(0, htShapeInfo + 1, htShapeInfo + rank + 1, coords, rank); - auto ctOffset = shape::getOffset(0, ctShapeInfo + 1, ctShapeInfo + rank + 1, coords, rank); + auto xOffset = shape::getOffset(xShapeInfo, coords); + auto htOffset = shape::getOffset(htShapeInfo, coords); + auto ctOffset = shape::getOffset(ctShapeInfo, coords); coords[2] *= 3; - auto wiOffset0 = shape::getOffset(0, wiShapeInfo + 1, wiShapeInfo + rank + 1, coords, rank); + auto wiOffset0 = shape::getOffset(wiShapeInfo, coords); auto wiOffset1 = wiOffset0 + wiShapeInfo[rank + 3]; // add last stride auto wiOffset2 = wiOffset1 + wiShapeInfo[rank + 3]; // add last stride @@ -363,15 +363,15 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI if(tid >= len) return; - shape::index2coords(rank, xShapeInfo + 2, tid, len, coords + 1); // loop through last two dimensions of x : {bS, 2*K} + shape::index2coords(tid, rank - 1, xShapeInfo + 2, coords + 1); // loop through last two dimensions of x : {bS, 2*K} - const auto maskOffst = mask ? shape::getOffset(0, maskShapeInfo + 1, maskShapeInfo + rank, coords + 1, rank - 1) : 0; - const auto c0Offset = shape::getOffset(0, c0ShapeInfo + 1, c0ShapeInfo + rank, coords + 1, rank - 1); - const auto gradCtOffset = shape::getOffset(0, gradCtShapeInfo + 1, gradCtShapeInfo + rank, coords + 1, rank - 1); - const auto gradC0Offset = shape::getOffset(0, gradC0ShapeInfo + 1, gradC0ShapeInfo + rank, coords + 1, rank - 1); - const auto bFOffset = shape::getOffset(0, bShapeInfo + 1, bShapeInfo + rank - 1, coords + 2, rank - 2); + const auto maskOffst = mask ? shape::getOffset(maskShapeInfo, coords + 1) : 0; + const auto c0Offset = shape::getOffset(c0ShapeInfo, coords + 1); + const auto gradCtOffset = shape::getOffset(gradCtShapeInfo, coords + 1); + const auto gradC0Offset = shape::getOffset(gradC0ShapeInfo, coords + 1); + const auto bFOffset = shape::getOffset(bShapeInfo, coords + 2); const auto bROffset = bFOffset + 2 * K * bShapeInfo[2]; // 2*K*b_stride - // const auto gradBFOffset = shape::getOffset(0, gradBShapeInfo + 1, gradBShapeInfo + rank, coords + 1, rank - 1); + // const auto gradBFOffset = shape::getOffset(gradBShapeInfo, coords + 1); const auto gradBFOffset = coords[1] * gradBShapeInfo[3] / 2 + coords[2] * gradBShapeInfo[4]; const auto gradBROffset = gradBFOffset + gradBShapeInfo[3]; @@ -382,16 +382,16 @@ __global__ static void sruBIBPCuda(const void* vx, const Nd4jLong* xShapeI else coords[0] = time - 1; - auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank); - auto ctOffset = shape::getOffset(0, ctShapeInfo + 1, ctShapeInfo + rank + 1, coords, rank); - auto gradIOffset = shape::getOffset(0, gradIShapeInfo + 1, gradIShapeInfo + rank + 1, coords, rank); - auto gradHtOffset = shape::getOffset(0, gradHtShapeInfo + 1, gradHtShapeInfo + rank + 1, coords, rank); + auto xOffset = shape::getOffset(xShapeInfo, coords); + auto ctOffset = shape::getOffset(ctShapeInfo, coords); + auto gradIOffset = shape::getOffset(gradIShapeInfo, coords); + auto gradHtOffset = shape::getOffset(gradHtShapeInfo, coords); coords[2] *= 3; - auto gradWiOffset0 = shape::getOffset(0, gradWiShapeInfo + 1, gradWiShapeInfo + rank + 1, coords, rank); + auto gradWiOffset0 = shape::getOffset(gradWiShapeInfo, coords); auto gradWiOffset1 = gradWiOffset0 + gradWiShapeInfo[rank + 3]; // add last stride auto gradWiOffset2 = gradWiOffset1 + gradWiShapeInfo[rank + 3]; // add last stride - auto wiOffset0 = shape::getOffset(0, wiShapeInfo + 1, wiShapeInfo + rank + 1, coords, rank); + auto wiOffset0 = shape::getOffset(wiShapeInfo, coords); auto wiOffset1 = wiOffset0 + wiShapeInfo[rank + 3]; // add last stride auto wiOffset2 = wiOffset1 + wiShapeInfo[rank + 3]; // add last stride
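Both SRU kernels now use the overload shape::index2coords(tid, rank - 1, xShapeInfo + 2, coords + 1): pointing two words into shapeInfo skips the rank word and the leading time dimension, so tid is de-linearized over only the trailing {bS, 2*K} dims while coords[0] stays free for the time step the kernel fills in afterwards. An illustrative sketch of that partial de-linearization (c-order; not the library code):

#include <cstdint>
using Nd4jLong = int64_t;

// Split `index` into coordinates over `rank` dimensions given in `shape`
// (which may be a tail slice of a larger shape), last dimension fastest.
static void index2coordsSketch(Nd4jLong index, int rank, const Nd4jLong* shape, Nd4jLong* coords) {
    for (int i = rank - 1; i >= 0; --i) {
        coords[i] = index % shape[i];
        index    /= shape[i];
    }
}

// Usage mirroring the SRU call: tid -> coords[1], coords[2] over {bS, 2*K}:
// index2coordsSketch(tid, rank - 1, xShapeInfo + 2, coords + 1);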
diff --git a/libnd4j/include/ops/declarable/helpers/cuda/stack.cu b/libnd4j/include/ops/declarable/helpers/cuda/stack.cu index e492baf8e..e88f5ade8 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/stack.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/stack.cu @@ -39,7 +39,7 @@ namespace helpers { if(tadShape == nullptr) { // scalar case for (Nd4jLong i = blockIdx.x * blockDim.x + threadIdx.x; i < inputListLength; i += gridDim.x * blockDim.x) - z[shape::getIndexOffset(i, zShapeInfo, inputListLength)] = reinterpret_cast<T*>(inputList[i])[0]; + z[shape::getIndexOffset(i, zShapeInfo)] = reinterpret_cast<T*>(inputList[i])[0]; } else { @@ -50,7 +50,7 @@ namespace helpers { auto xShapeInfo = reinterpret_cast<Nd4jLong*>(inputShapeList[t]); for (int e = threadIdx.x; e < arrLen; e += blockDim.x) - tZ[shape::getIndexOffset(e, tadShape, arrLen)] = tX[shape::getIndexOffset(e, xShapeInfo, arrLen)]; + tZ[shape::getIndexOffset(e, tadShape)] = tX[shape::getIndexOffset(e, xShapeInfo)]; } } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/svd.cu b/libnd4j/include/ops/declarable/helpers/cuda/svd.cu index 0695119da..b39ebf81b 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/svd.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/svd.cu @@ -65,12 +65,12 @@ __global__ static void inverseColumnSignCuda(void* vu, const Nd4jLong* uShapeInf // u for (Nd4jLong i = ind; i < uLen; i += gridDim.x * blockDim.x) { - shape::index2coords(rank, uShapeInfo + 1, i, uLen, coords); + shape::index2coords(i, uShapeInfo, coords); if(coords[rank - 1] == 0 || coords[rank - 1] == uLastButOneColumn) // do not change sign in first and last but one columns continue; - const auto uOffset = shape::getOffset(0, uShapeInfo + 1, uShapeInfo + rank + 1, coords, rank); + const auto uOffset = shape::getOffset(uShapeInfo, coords); u[uOffset] = -u[uOffset]; } @@ -78,12 +78,12 @@ __global__ static void inverseColumnSignCuda(void* vu, const Nd4jLong* uShapeInf // v for (Nd4jLong i = ind; i < vLen; i += gridDim.x * blockDim.x) { - shape::index2coords(rank, vShapeInfo + 1, i, vLen, coords); + shape::index2coords(i, vShapeInfo, coords); if(coords[rank - 2] == 0 || coords[rank - 2] == vLastButOneColumn) // do not change sign in first and last but one columns continue; - const auto vOffset = shape::getOffset(0, vShapeInfo + 1, vShapeInfo + rank + 1, coords, rank); + const auto vOffset = shape::getOffset(vShapeInfo, coords); v[vOffset] = -v[vOffset]; } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu b/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu index db6213dd3..972013835 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/top_k.cu @@ -50,15 +50,15 @@ __global__ static void inTopKCuda(const void* vx, const Nd4jLong* xShapeInfo, xTadLen = shape::length(xTadShapeInfo); xTad = reinterpret_cast<const X*>(vx) + xTadOffsets[blockIdx.x]; - idx = y[shape::getIndexOffset(blockIdx.x, yShapeInfo, shape::length(yShapeInfo))]; // shape::length(yShapeInfo) == numTads - elemToCompare = xTad[shape::getIndexOffset(idx, xTadShapeInfo, xTadLen)]; + idx = y[shape::getIndexOffset(blockIdx.x, yShapeInfo)]; // shape::length(yShapeInfo) == numTads + elemToCompare = xTad[shape::getIndexOffset(idx, xTadShapeInfo)]; } __syncthreads(); sharedMem[threadIdx.x] = 0; for (Nd4jLong i = threadIdx.x; i < xTadLen; i += blockDim.x) - if(elemToCompare < xTad[shape::getIndexOffset(i, xTadShapeInfo, xTadLen)]) + if(elemToCompare < xTad[shape::getIndexOffset(i, xTadShapeInfo)]) ++sharedMem[threadIdx.x]; __syncthreads(); @@ -71,7 +71,7 @@ __global__ static void inTopKCuda(const void* vx, const Nd4jLong* xShapeInfo, } if (threadIdx.x == 0) - z[shape::getIndexOffset(blockIdx.x, zShapeInfo, shape::length(zShapeInfo))] = *sharedMem < k; + z[shape::getIndexOffset(blockIdx.x, zShapeInfo)] = *sharedMem < k; } /////////////////////////////////////////////////////////////////// @@ -117,9 +117,9 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con auto z = reinterpret_cast<X*>(vz) + zTadOffsets[t]; for (int e = threadIdx.x; e < k; e += blockDim.x) { - auto idx = i[shape::getIndexOffset(e, iTadShapeInfo, k)]; + auto idx = i[shape::getIndexOffset(e, iTadShapeInfo)]; - z[shape::getIndexOffset(e, zTadShapeInfo, k)] = x[shape::getIndexOffset(idx, xTadShapeInfo, tadLength)]; + z[shape::getIndexOffset(e, zTadShapeInfo)] = x[shape::getIndexOffset(idx, xTadShapeInfo)]; } } } @@ -153,7 +153,7 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con // local max values/indices for (int e = threadIdx.x; e < tadLength; e++) { - auto value = x[shape::getIndexOffset(e, xTadShapeInfo, tadLength)]; + auto value = x[shape::getIndexOffset(e, xTadShapeInfo)]; // we'll compare this value to current stored ones for (int f = 0; f < scanWidth; f++) { @@ -180,8 +180,8 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con // at this point we know local minimum for next iteration if (threadIdx.x == 0) { localMaximum = tempValues[scanWidth - 1]; - z[shape::getIndexOffset(p, zTadShapeInfo, k)] = tempValues[scanWidth - 1]; - i[shape::getIndexOffset(p, iTadShapeInfo, k)] = tempIndices[scanWidth - 1]; + z[shape::getIndexOffset(p, zTadShapeInfo)] = tempValues[scanWidth - 1]; + i[shape::getIndexOffset(p, iTadShapeInfo)] = tempIndices[scanWidth - 1]; } __syncthreads(); } @@ -194,8 +194,8 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con for (int tid = threadIdx.x; tid < k; tid += blockDim.x) { auto top = 2 * tid + 1; if (top < k) { - auto t0 = shape::getIndexOffset(top - 1, iTadShapeInfo, k); - auto t1 = shape::getIndexOffset(top, iTadShapeInfo, k); + auto t0 = shape::getIndexOffset(top - 1, iTadShapeInfo); + auto t1 = shape::getIndexOffset(top, iTadShapeInfo); if (i[t0] > i[t1]) { // swap indices first @@ -215,8 +215,8 @@ int inTopKFunctor(nd4j::LaunchContext * context, const NDArray* predictions, con for (int tid = threadIdx.x; tid < k; tid += blockDim.x) { auto top = 2 * tid + 2; if (top < k) { - auto t0 = shape::getIndexOffset(top - 1, iTadShapeInfo, k); - auto t1 = shape::getIndexOffset(top, iTadShapeInfo, k); + auto t0 = shape::getIndexOffset(top - 1, iTadShapeInfo); + auto t1 = shape::getIndexOffset(top, iTadShapeInfo); if (i[t0] > i[t1]) { // swap indices first diff --git a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu index c3e4f497e..0a707ffb3 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/transforms.cu @@ -54,9 +54,9 @@ __global__ static void invertPermutationCuda(const void* vx, const Nd4jLong* xSh for (Nd4jLong i = tid; i < len; i += totalThreads) { - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); const Nd4jLong index = x[xOffset]; - const auto zOffset = shape::getIndexOffset(index, zShapeInfo, len); + const auto zOffset = shape::getIndexOffset(index, zShapeInfo); z[zOffset] = i; } } @@ -112,15 +112,15 @@ __global__ static void traceCuda(const void* vx, const Nd4jLong* xShapeInfo, voi for (uint m = blockIdx.x; m < zLen; m += gridDim.x) { // one block per each element of z, that is per each matrix - shape::index2coords(zRank, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), m, zLen, coords); - const auto zOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(zShapeInfo)), shape::stride(const_cast<Nd4jLong*>(zShapeInfo)), coords, zRank); + shape::index2coords(m, zShapeInfo, coords); + const auto zOffset = shape::getOffset(zShapeInfo, coords); sharedMem[threadIdx.x] = 0; for (uint i = threadIdx.x; i < diagLen; i += blockDim.x) { coords[zRank] = coords[zRank + 1] = i; - const auto xOffset = shape::getOffset(0, shape::shapeOf(const_cast<Nd4jLong*>(xShapeInfo)), shape::stride(const_cast<Nd4jLong*>(xShapeInfo)), coords, xRank); + const auto xOffset = shape::getOffset(xShapeInfo, coords); sharedMem[threadIdx.x] += x[xOffset]; } @@ -197,14 +197,14 @@ __global__ static void triuBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo for (Nd4jLong i = tid; i < len; i += totalThreads) { - shape::index2coords(rank, zShapeInfo + 1, i, len, coords); + shape::index2coords(i, zShapeInfo, coords); - const auto zOffset = shape::getOffset(0, zShapeInfo + 1, zShapeInfo + rank + 1, coords, rank); + const auto zOffset = shape::getOffset(zShapeInfo, coords); if((coords[rank - 2] + diag > coords[rank - 1])) // row + diag > col z[zOffset] = 0; else - z[zOffset] = x[areSameOffsets ? zOffset : shape::getOffset(0, xShapeInfo + 1, xShapeInfo + rank + 1, coords, rank)]; + z[zOffset] = x[areSameOffsets ? zOffset : shape::getOffset(xShapeInfo, coords)]; } } @@ -263,7 +263,7 @@ __global__ static void tileBPCuda(const void* vx, const Nd4jLong* xShapeInfo, vo for (Nd4jLong i = tid; i < zLen; i += totalThreads) { - const auto zOffset = shape::getIndexOffset(i, zShapeInfo, zLen); + const auto zOffset = shape::getIndexOffset(i, zShapeInfo); shape::outerArrayOffsets(xOffsets, i, xShapeInfo, zShapeInfo, memBuff); @@ -329,8 +329,8 @@ __global__ static void clipByNormBPWholeArrCuda(const void* vx, const Nd4jLong* __syncthreads(); // fill shared memory with array elements - const auto xVal = x[shape::getIndexOffset(tid, xShapeInfo, len)]; - const auto yVal = y[shape::getIndexOffset(tid, yShapeInfo, len)]; + const auto xVal = x[shape::getIndexOffset(tid, xShapeInfo)]; + const auto yVal = y[shape::getIndexOffset(tid, yShapeInfo)]; shMem[2*threadIdx.x] = static_cast<Z>(xVal * xVal); // for norm shMem[2*threadIdx.x + 1] = static_cast<Z>(xVal * yVal); // for input * gradO @@ -414,12 +414,12 @@ __global__ static void clipByNormBPCalcGradCuda(const void* vx, const Nd4jLong* } __syncthreads(); - const auto yOffset = shape::getIndexOffset(tid, yShapeInfo, len); - const auto zOffset = shape::getIndexOffset(tid, zShapeInfo, len); + const auto yOffset = shape::getIndexOffset(tid, yShapeInfo); + const auto zOffset = shape::getIndexOffset(tid, zShapeInfo); if(norm > clipNormVal) { - const auto xOffset = shape::getIndexOffset(tid, xShapeInfo, len); + const auto xOffset = shape::getIndexOffset(tid, xShapeInfo); const Z factor1 = static_cast<Z>(1) / norm; // 1 / norm const Z factor2 = factor1 / (norm * norm); // 1 / (norm * norm * norm) @@ -462,8 +462,8 @@ __global__ static void clipByNormBPTadsCuda(const void* vx, const Nd4jLong* xTad for (uint i = threadIdx.x; i < tadLen; i += blockDim.x) { - const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo, tadLen); - const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo, tadLen); + const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo); + const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo); shMem[2*threadIdx.x] = static_cast<Z>(xTad[xOffset] * xTad[xOffset]); // for norm shMem[2*threadIdx.x + 1] = static_cast<Z>(xTad[xOffset] * yTad[yOffset]); // for input * gradO @@ -491,12 +491,12 @@ __global__ static void clipByNormBPTadsCuda(const void* vx, const Nd4jLong* xTad for (uint i = threadIdx.x; i < tadLen; i += blockDim.x) { - const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo, tadLen); - const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo, tadLen); + const auto yOffset = shape::getIndexOffset(i, yTadShapeInfo); + const auto zOffset = shape::getIndexOffset(i, zTadShapeInfo); if(norm > clipNormVal) { - const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo, tadLen); + const auto xOffset = shape::getIndexOffset(i, xTadShapeInfo); const Z factor1 = static_cast<Z>(1) / norm; // 1 / norm const Z factor2 = factor1 / (norm * norm); // 1 / (norm * norm * norm)
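The factor1/factor2 pair in these two clip-by-norm backprop kernels implements the gradient of z = x * clipNorm / norm for norm > clipNorm, i.e. dL/dx_i = clipNorm * (gradO_i / norm - x_i * <x, gradO> / norm^3). A scalar sketch under the assumption that norm and the dot product have already been reduced (as the kernels do in shared memory); names are illustrative:

// Per-element gradient of clip-by-norm, given norm = ||x|| and
// dot = <x, gradO> already reduced across the array/TAD.
static double clipByNormGradSketch(double xi, double gradOi, double norm, double dot, double clipNorm) {
    if (norm <= clipNorm)
        return gradOi;                               // inside the ball: identity
    const double factor1 = 1.0 / norm;               // 1 / norm
    const double factor2 = factor1 / (norm * norm);  // 1 / norm^3
    return clipNorm * (factor1 * gradOi - factor2 * xi * dot);
}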
@@ -563,23 +563,25 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } template <typename T> - static __global__ void swapShuffleKernel(T* input, Nd4jLong* shape, Nd4jLong firstDim, Nd4jLong len, nd4j::graph::RandomGenerator* rng) { + static __global__ void swapShuffleKernel(T* input, Nd4jLong* shape, Nd4jLong firstDim, nd4j::graph::RandomGenerator* rng) { auto tid = blockIdx.x * blockDim.x; auto step = blockDim.x * gridDim.x; for (int i = firstDim - 1 - tid - threadIdx.x; i > 0; i -= step) { int r = rng->relativeInt(i) % i; if (i != r) { - T e0 = input[shape::getIndexOffset(i, shape, len)]; - T e1 = input[shape::getIndexOffset(r, shape, len)]; + const auto iOffset = shape::getIndexOffset(i, shape); + const auto rOffset = shape::getIndexOffset(r, shape); + T e0 = input[iOffset]; + T e1 = input[rOffset]; //math::nd4j_swap(input(i), input(r)); - input[shape::getIndexOffset(i, shape, len)] = e1; - input[shape::getIndexOffset(r, shape, len)] = e0; + input[iOffset] = e1; + input[rOffset] = e0; } } } template <typename T> - static __global__ void fillShuffleKernel(T* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape, Nd4jLong firstDim, Nd4jLong len, int* indices, nd4j::graph::RandomGenerator* rng) { + static __global__ void fillShuffleKernel(T* input, Nd4jLong* inputShape, T* output, Nd4jLong* outputShape, Nd4jLong firstDim, int* indices, nd4j::graph::RandomGenerator* rng) { // PRAGMA_OMP_PARALLEL_FOR_IF((firstDim-1) > Environment::getInstance()->tadThreshold()) auto tid = blockIdx.x * blockDim.x; @@ -587,9 +589,9 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr for(int i = firstDim - 1 - tid - threadIdx.x; i > 0; i -= step) { int r = rng->relativeInt(i) % i; - output[shape::getIndexOffset(i, outputShape, len)] = input[shape::getIndexOffset(indices[r], inputShape, len)]; + output[shape::getIndexOffset(i, outputShape)] = input[shape::getIndexOffset(indices[r], inputShape)]; if(i != r) { - output[shape::getIndexOffset(r, outputShape, len)] = input[shape::getIndexOffset(indices[i], inputShape, len)]; + output[shape::getIndexOffset(r, outputShape)] = input[shape::getIndexOffset(indices[i], inputShape)]; // output.p(r, input.e(indices[i])); // math::nd4j_swap(indices[i], indices[r]); atomicExch(&indices[i], indices[r]); @@ -618,7 +620,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr cudaMemcpy(dRandom, &rng, sizeof(nd4j::graph::RandomGenerator), cudaMemcpyHostToDevice); T* inputBuf = reinterpret_cast<T*>(input.specialBuffer()); if(isInplace) { - swapShuffleKernel<T><<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), firstDim, input.lengthOf(), dRandom); + swapShuffleKernel<T><<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), firstDim, dRandom); } else { std::vector<int> indices(firstDim); @@ -628,7 +630,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr PointersManager pointersManager(context, "helper::randomShuffle_"); int* indicesDev = reinterpret_cast<int*>(pointersManager.replicatePointer(indices.data(), indices.size() * sizeof(int))); T* outputBuf = reinterpret_cast<T*>(output.specialBuffer()); - fillShuffleKernel<T><<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), outputBuf, output.specialShapeInfo(), firstDim, input.lengthOf(), indicesDev, dRandom); + fillShuffleKernel<T><<<128, 256, 1024, *stream>>>(inputBuf, input.specialShapeInfo(), outputBuf, output.specialShapeInfo(), firstDim, indicesDev, dRandom); pointersManager.synchronize(); } // rng.rewindH(firstDim - 1); @@ -704,7 +706,7 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr } __syncthreads(); for (int j = threadIdx.x; j < len; j+= blockDim.x) { - auto xIndex = shape::getIndexOffset(j, shape, len); + auto xIndex = shape::getIndexOffset(j, shape); if(norm2Buf[arr] > clipNorm) z[xIndex] *= clipNorm / norm2Buf[arr]; // case with ews = 1 and ordering is 'c' @@ -714,23 +716,22 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template <typename T> static __global__ void clipByNormKernel(Nd4jLong numOfSubArrs, T* inputBuffer, Nd4jLong* shape, Nd4jLong* inputOffsets, T* outputBuffer, Nd4jLong* outputShape, Nd4jLong* outputOffsets, T* norm2Buf, Nd4jLong* norm2shape, T clipNorm) { + for (Nd4jLong arr = blockIdx.x; arr < numOfSubArrs; arr += gridDim.x) { __shared__ T* x, *z; - __shared__ Nd4jLong lenX, lenZ; + __shared__ Nd4jLong lenZ; __shared__ T norm2; if (threadIdx.x == 0) { - lenX = shape::length(shape); x = inputBuffer + inputOffsets[arr]; z = outputBuffer + outputOffsets[arr]; lenZ = shape::length(outputShape); - norm2 = norm2Buf[shape::getIndexOffset(arr, norm2shape, numOfSubArrs)]; - //printf("%d: %lf (vs %lf) %lld %lld\n", arr, norm2, clipNorm, lenX, lenZ); + norm2 = norm2Buf[shape::getIndexOffset(arr, norm2shape)]; } __syncthreads(); for (Nd4jLong j = threadIdx.x; j < lenZ; j+= blockDim.x) { - auto xIndex = shape::getIndexOffset(j, shape, lenX); - auto zIndex = shape::getIndexOffset(j, outputShape, lenZ); + auto xIndex = shape::getIndexOffset(j, shape); + auto zIndex = shape::getIndexOffset(j, outputShape); if(norm2 > clipNorm) { z[zIndex] = x[xIndex] * clipNorm / norm2; // case with ews = 1 and ordering is 'c' } else { @@ -916,8 +917,8 @@ void clipByNormBP(nd4j::LaunchContext* context, const NDArray& input, const NDAr else outputBuf[e] = inputBuf[e]; } else { - auto inputOffset = shape::getIndexOffset(e, inputShape, length); - auto outputOffset = shape::getIndexOffset(e, outputShape, length); + auto inputOffset = shape::getIndexOffset(e, inputShape); + auto outputOffset = shape::getIndexOffset(e, outputShape); if (inputBuf[inputOffset] > rightBound) outputBuf[outputOffset] = (T) rightBound; else if (inputBuf[inputOffset] < leftBound) outputBuf[outputOffset] = (T) leftBound; else outputBuf[outputOffset] = inputBuf[outputOffset]; diff --git a/libnd4j/include/ops/declarable/helpers/cuda/weights.cu b/libnd4j/include/ops/declarable/helpers/cuda/weights.cu index 55f859295..622732d7d 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/weights.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/weights.cu @@ -29,7 +29,7 @@ namespace helpers { template <typename T> static __device__ void adjustWeightsKernelD(void* inputBuffer, Nd4jLong* inputShape, void* weightsBuffer, Nd4jLong* weightsShape, - void* outputBuffer, Nd4jLong inputLength, Nd4jLong weightsLength, + void* outputBuffer, Nd4jLong inputLength, Nd4jLong outputLength, int val) { // typedef Nd4jLong T; auto tid = threadIdx.x; @@ -39,13 +39,13 @@ namespace helpers { //for (int e = 0; e < inputLength; e++) { for (Nd4jLong e = tid; e < inputLength; e += blockDim.x) { - Nd4jLong xOffset = shape::getIndexOffset(e, inputShape, inputLength); + Nd4jLong xOffset = shape::getIndexOffset(e, inputShape); int current = *(reinterpret_cast<T*>(inputBuffer) + xOffset); if (current == val) { //printf("%lld\n", xOffset); - //Nd4jLong zOffset = shape::getIndexOffset(val, outputShape, outputLength); + //Nd4jLong zOffset = shape::getIndexOffset(val, outputShape); if (weightsBuffer != nullptr) { - Nd4jLong yOffset = shape::getIndexOffset(e, weightsShape, weightsLength); + Nd4jLong yOffset = shape::getIndexOffset(e, weightsShape); //atomicAdd(); //*reinterpret_cast<T*>(outputBuffer) += reinterpret_cast<T*>(weightsBuffer)[yOffset]; nd4j::math::atomics::nd4j_atomicAdd(reinterpret_cast<T*>(outputBuffer), reinterpret_cast<T*>(weightsBuffer)[yOffset]); //output->p(val, output->e(val) + 1); @@ -74,22 +74,19 @@ namespace helpers { //auto tid = blockIdx.x * blockDim.x + threadIdx.x; // * blockDim.x; // + threadIdx.x; int threadCount = gridDim.x * blockDim.x; Nd4jLong inputLength = shape::length(inputShape); - Nd4jLong weightsLength = 0; - if (weightsBuffer != nullptr) - weightsLength = shape::length(weightsShape); Nd4jLong outputLength = shape::length(outputShape); - Nd4jLong borderLen = 1;//outputLength / gridDim.x + outputLength % gridDim.x; + Nd4jLong borderLen = 1; for (Nd4jLong e = blockIdx.x; e < outputLength; e += threadCount) { //if (blockIdx.x < outputLength) { //if (e + threadCount < outputLength) { - Nd4jLong zOffset = shape::getIndexOffset(e, outputShape, outputLength); + Nd4jLong zOffset = shape::getIndexOffset(e, outputShape); //printf("%d %d %d\n", blockIdx.x, blockDim.x, threadIdx.x); //Nd4jLong borderLen = 1; T* outputBufferZ = reinterpret_cast<T*>(outputBuffer) + zOffset; adjustWeightsKernelD<T>(inputBuffer, inputShape, weightsBuffer, weightsShape, (void*)outputBufferZ, - inputLength, weightsLength, outputLength, (int)zOffset); + inputLength, outputLength, (int)zOffset); } } diff --git a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu index b131ff83f..ada547ac3 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/zeta.cu @@ -46,9 +46,9 @@ __global__ static void zetaCuda(const void *vx, const Nd4jLong *xShapeInfo, for (int i = tid; i < len; i += totalThreads) { - const auto xOffset = shape::getIndexOffset(i, xShapeInfo, len); - const auto qOffset = shape::getIndexOffset(i, qShapeInfo, len); - const auto zOffset = shape::getIndexOffset(i, zShapeInfo, len); + const auto xOffset = shape::getIndexOffset(i, xShapeInfo); + const auto qOffset = shape::getIndexOffset(i, qShapeInfo); + const auto zOffset = shape::getIndexOffset(i, zShapeInfo); z[zOffset] = zetaScalar<T>(x[xOffset], q[qOffset]); } diff --git a/libnd4j/include/ops/declarable/helpers/flatten.h b/libnd4j/include/ops/declarable/helpers/flatten.h index 05421383f..0513e45ea 100644 --- a/libnd4j/include/ops/declarable/helpers/flatten.h +++ b/libnd4j/include/ops/declarable/helpers/flatten.h @@ -24,12 +24,45 @@ #include #include -namespace nd4j { - namespace ops { - namespace helpers { - void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order); +namespace nd4j { +namespace ops { +namespace helpers { + + +////////////////////////////////////////////////////////////////////// +void flatten(nd4j::LaunchContext *context, std::vector<NDArray*> &inputs, NDArray *output, char order); + + +////////////////////////////////////////////////////////////////////// +INLINEDEF _CUDA_HD Nd4jLong getIndexOffsetOrdered(Nd4jLong index, const Nd4jLong *shapeInfo, const char order) { + + Nd4jLong offset = 0; + + if (order == 'c') { + + for(uint i = shapeInfo[0]; i > 1; --i) { + offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]]; + index /= shapeInfo[i]; + } + + offset += index * shapeInfo[1 + shapeInfo[0]]; // last iteration + } + else { + + for(uint i = 1; i < shapeInfo[0]; ++i) { + offset += (index % shapeInfo[i]) * shapeInfo[i + shapeInfo[0]]; + index /= shapeInfo[i]; + } + + offset += index * shapeInfo[2 * shapeInfo[0]]; // last iteration + } + + return offset; +} + + +} +} } #endif //DEV_TESTS_FLATTEN_H
length)]; - values[shape::getIndexOffset(j, yShapeInfo, length)] = vtmp; + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; i++; j--; } } else { - while (key[shape::getIndexOffset(i, xShapeInfo, length)] < pivot) + while (key[shape::getIndexOffset(i, xShapeInfo)] < pivot) i++; - while (key[shape::getIndexOffset(j, xShapeInfo, length)] > pivot) + while (key[shape::getIndexOffset(j, xShapeInfo)] > pivot) j--; if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo, length)]; - key[shape::getIndexOffset(i, xShapeInfo, length)] = key[shape::getIndexOffset(j, xShapeInfo, length)]; - key[shape::getIndexOffset(j, xShapeInfo, length)] = ktmp; + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - vtmp = values[shape::getIndexOffset(i, yShapeInfo, length)]; - values[shape::getIndexOffset(i, yShapeInfo, length)] = values[shape::getIndexOffset(j, yShapeInfo, length)]; - values[shape::getIndexOffset(j, yShapeInfo, length)] = vtmp; + vtmp = values[shape::getIndexOffset(i, yShapeInfo)]; + values[shape::getIndexOffset(i, yShapeInfo)] = values[shape::getIndexOffset(j, yShapeInfo)]; + values[shape::getIndexOffset(j, yShapeInfo)] = vtmp; i++; j--; @@ -523,10 +522,9 @@ PRAGMA_OMP_TASK template void quickSort_parallel_internal_value(X* key, Nd4jLong *xShapeInfo, Y* value, Nd4jLong *yShapeInfo, int left, int right, int cutoff, bool descending) { - auto length = shape::length(xShapeInfo); int i = left, j = right; X ktmp; - Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo, length)]; + Y pivot = value[shape::getIndexOffset((left + right) / 2, yShapeInfo)]; Y vtmp; @@ -534,35 +532,35 @@ PRAGMA_OMP_TASK /* PARTITION PART */ while (i <= j) { if (descending) { - while (value[shape::getIndexOffset(i, yShapeInfo, length)] > pivot) + while (value[shape::getIndexOffset(i, yShapeInfo)] > pivot) i++; - while (value[shape::getIndexOffset(j, yShapeInfo, length)] < pivot) + while (value[shape::getIndexOffset(j, yShapeInfo)] < pivot) j--; if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo, length)]; - key[shape::getIndexOffset(i, xShapeInfo, length)] = key[shape::getIndexOffset(j, xShapeInfo, length)]; - key[shape::getIndexOffset(j, xShapeInfo, length)] = ktmp; + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - vtmp = value[shape::getIndexOffset(i, yShapeInfo, length)]; - value[shape::getIndexOffset(i, yShapeInfo, length)] = value[shape::getIndexOffset(j, yShapeInfo, length)]; - value[shape::getIndexOffset(j, yShapeInfo, length)] = vtmp; + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; i++; j--; } } else { - while (value[shape::getIndexOffset(i, yShapeInfo, length)] < pivot) + while (value[shape::getIndexOffset(i, yShapeInfo)] < pivot) i++; - while (value[shape::getIndexOffset(j, yShapeInfo, length)] > pivot) + while (value[shape::getIndexOffset(j, yShapeInfo)] > pivot) j--; if (i <= j) { - ktmp = key[shape::getIndexOffset(i, xShapeInfo, length)]; - key[shape::getIndexOffset(i, xShapeInfo, length)] = 
key[shape::getIndexOffset(j, xShapeInfo, length)]; - key[shape::getIndexOffset(j, xShapeInfo, length)] = ktmp; + ktmp = key[shape::getIndexOffset(i, xShapeInfo)]; + key[shape::getIndexOffset(i, xShapeInfo)] = key[shape::getIndexOffset(j, xShapeInfo)]; + key[shape::getIndexOffset(j, xShapeInfo)] = ktmp; - vtmp = value[shape::getIndexOffset(i, yShapeInfo, length)]; - value[shape::getIndexOffset(i, yShapeInfo, length)] = value[shape::getIndexOffset(j, yShapeInfo, length)]; - value[shape::getIndexOffset(j, yShapeInfo, length)] = vtmp; + vtmp = value[shape::getIndexOffset(i, yShapeInfo)]; + value[shape::getIndexOffset(i, yShapeInfo)] = value[shape::getIndexOffset(j, yShapeInfo)]; + value[shape::getIndexOffset(j, yShapeInfo)] = vtmp; i++; j--; diff --git a/libnd4j/include/ops/special_accumulation_ops.h b/libnd4j/include/ops/special_accumulation_ops.h index 7a587e754..3f2b2ed1d 100644 --- a/libnd4j/include/ops/special_accumulation_ops.h +++ b/libnd4j/include/ops/special_accumulation_ops.h @@ -114,15 +114,15 @@ namespace simdOps { tadLength = shape::length(tadOnlyShapeInfo);//shape::tadLength(xShapeInfo, dimension, dimensionLength); numTads = shape::length(xShapeInfo) / tadLength; } - __syncthreads(); + __syncthreads(); for (int r = blockIdx.x; r < numTads; r += gridDim.x) { auto tadOffsetForBlock = tadOffsets[r]; sPartials[threadIdx.x] = startingValue(dx + tadOffsetForBlock); - for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { - auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo, tadLength); + for (int i = threadIdx.x; i < tadLength; i += blockDim.x) { + auto xOffset = tadOffsetForBlock + shape::getIndexOffset(i, tadOnlyShapeInfo); sPartials[threadIdx.x] = update(sPartials[threadIdx.x], op(dx[xOffset], result[r]), extraParams); } __syncthreads(); @@ -198,8 +198,8 @@ namespace simdOps { auto offset = tadOffsets[i]; T start = startingValue(x + offset); - for (int j = 0; j < tadLength; j++) { - auto xOffset = offset + shape::getIndexOffset(j, tadOnlyShapeInfo, tadLength); + for (int j = 0; j < tadLength; j++) { + auto xOffset = offset + shape::getIndexOffset(j, tadOnlyShapeInfo); start = update(start, op(x[xOffset], result[i]), extraParams); } diff --git a/libnd4j/include/ops/special_ops.h b/libnd4j/include/ops/special_ops.h index 33cce53c6..8f6ef6b5b 100644 --- a/libnd4j/include/ops/special_ops.h +++ b/libnd4j/include/ops/special_ops.h @@ -81,8 +81,8 @@ namespace simdOps { static inline __device__ void execSpecialCuda( T *dx, Nd4jLong *xShapeBuffer, Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, + Z *extraParams, + int *allocationPointer, Z *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { __shared__ int kH; @@ -119,7 +119,7 @@ namespace simdOps { __shared__ int kHEff; __shared__ int kWEff; __shared__ bool fOrder; - + if (threadIdx.x == 0) { kH = (int)extraParams[0]; @@ -266,7 +266,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha const Nd4jLong sH = (int)extraParams[2]; const Nd4jLong sW = (int)extraParams[3]; const Nd4jLong pH = (int)extraParams[4]; - const Nd4jLong pW = (int)extraParams[5]; + const Nd4jLong pW = (int)extraParams[5]; const Nd4jLong dH = (int)extraParams[6]; const Nd4jLong dW = (int)extraParams[7]; Nd4jLong poolingMode = (int)extraParams[9]; @@ -285,7 +285,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha const int iH = shape::sizeAt(inShapeBuffer, 2); const int iW = shape::sizeAt(inShapeBuffer, 3); const 
int oH = shape::sizeAt(outShapeBuffer, 2); - const int oW = shape::sizeAt(outShapeBuffer, 3); + const int oW = shape::sizeAt(outShapeBuffer, 3); const Nd4jLong iStride0 = shape::stride(inShapeBuffer)[0]; const Nd4jLong iStride1 = shape::stride(inShapeBuffer)[1]; const Nd4jLong iStride2 = shape::stride(inShapeBuffer)[2]; @@ -296,28 +296,28 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha const Nd4jLong oStride3 = shape::stride(outShapeBuffer)[3]; const Nd4jLong iStep2 = dH*iStride2; - const Nd4jLong iStep3 = dW*iStride3; + const Nd4jLong iStep3 = dW*iStride3; const int kProd = kH*kW; - const T iStep2Inv = 1./iStep2; + const T iStep2Inv = 1./iStep2; const T iStep3Inv = 1./iStep3; Nd4jLong hstart, wstart, hend, wend; T sum, *pIn; - if(poolingMode == 0) { // max + if(poolingMode == 0) { // max PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { + for(int c = 0; c < iC; ++c) { for(int oh = 0; oh < oH; ++oh) { for(int ow = 0; ow < oW; ++ow) { - + pIn = in + b * iStride0 + c * iStride1; - + hstart = oh * sH - pH; - wstart = ow * sW - pW; + wstart = ow * sW - pW; hend = hstart + kHEff; wend = wstart + kWEff; - + if(hstart < 0) hstart += dH * (Nd4jLong)nd4j::math::nd4j_ceil(static_cast(-hstart) / static_cast(dH)); if(wstart < 0) @@ -333,8 +333,8 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha wend *= iStride3; sum = -nd4j::DataTypeUtils::max(); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) { T val = pIn[kh + kw]; if (val > sum) @@ -344,16 +344,16 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } } } - } + } } -/*************************************************************************/ +/*************************************************************************/ else if(poolingMode == 1) { // avg PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { + for(int c = 0; c < iC; ++c) { for(int oh = 0; oh < oH; ++oh) { for(int ow = 0; ow < oW; ++ow) { - + pIn = in + b * iStride0 + c * iStride1; hstart = oh * sH - pH; @@ -376,30 +376,30 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha wend *= iStride3; sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) sum += pIn[kh + kw]; - + if ((int) extraParam0 == 0) //Exclude padding sum /= static_cast(nd4j::math::nd4j_ceil(static_cast(hend-hstart) / static_cast(iStep2))) * static_cast(nd4j::math::nd4j_ceil(static_cast(wend-wstart) / static_cast(iStep3))); //Accounts for dilation else if ((int) extraParam0 == 1) //Include padding sum /= kProd; - + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } } } } - } -/*************************************************************************/ + } +/*************************************************************************/ else if(poolingMode == 2) { // pnorm PRAGMA_OMP_PARALLEL_FOR_ARGS(private(pIn, sum, hstart, wstart, hend, wend) collapse(2)) for(int b = 0; b < bS; ++b) { - for(int c = 0; c < iC; ++c) { + for(int c = 0; c < iC; ++c) { for(int oh = 0; oh < oH; ++oh) { for(int ow = 0; ow < oW; ++ow) { - + pIn = in + b * 
iStride0 + c * iStride1; hstart = oh * sH - pH; @@ -422,13 +422,13 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha wend *= iStride3; sum = static_cast(0.); - - for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) + + for (Nd4jLong kh = hstart; kh < hend; kh += iStep2) for (Nd4jLong kw = wstart; kw < wend; kw += iStep3) sum += nd4j::math::nd4j_pow(nd4j::math::nd4j_abs(pIn[kh + kw]), extraParam0); - + sum = nd4j::math::nd4j_pow(sum, (T) 1. / extraParam0); - + out[b * oStride0 + c * oStride1 + oh * oStride2 + ow * oStride3] = sum; } } @@ -482,7 +482,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } template - class + class Im2col { public: static const bool requiresSpecial = true; @@ -502,8 +502,8 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha static inline __device__ void execSpecialCuda( T *dx, Nd4jLong *xShapeBuffer, T *result, Nd4jLong *zShapeBuffer, - T *extraParams, - int *allocationPointer, T *reductionPointer, + T *extraParams, + int *allocationPointer, T *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ @@ -606,7 +606,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha T *extraParams, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { /*kernel[0], kernel[1], stride[0], stride[1], padding[0], padding[1], 0, false*/ - // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] + // [bS, iC, iH, iW] is convoluted to [bS, iC, kH, kW, oH, oW] int kH = (int)extraParams[0]; int kW = (int)extraParams[1]; @@ -615,7 +615,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha int pH = (int)extraParams[4]; int pW = (int)extraParams[5]; int dH = (int)extraParams[6]; //Dilation, height/y dimension - int dW = (int)extraParams[7]; //Dilation, width/x dimension + int dW = (int)extraParams[7]; //Dilation, width/x dimension T zeroPadVal = extraParams[9]; auto colShape = shape::shapeOf(colShapeBuffer); @@ -642,33 +642,33 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha T *col, *im; int imRow, imCol; - + if (shape::order(imShapeBuffer) == 'c' && shape::order(colShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(imShapeBuffer) && shape::strideDescendingCAscendingF(colShapeBuffer)) { PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { - + for (int colW = 0; colW < oW; ++colW) { + imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; - + col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; - im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - + im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) *col = zeroPadVal; - else + else *col = *im; } } } } } - } + } } else { @@ -677,18 +677,18 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha for (int colH = 0; colH < oH; ++colH) { for (int colW 
= 0; colW < oW; ++colW) { for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; - + col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; - + if (static_cast(imRow) >= static_cast(iH) || static_cast(imCol) >= static_cast(iW)) *col = zeroPadVal; - else + else *col = *im; } } @@ -743,8 +743,8 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha static inline __device__ void execSpecialCuda( T *dx, Nd4jLong *xShapeBuffer, Z *result, Nd4jLong *zShapeBuffer, - Z *extraParams, - int *allocationPointer, Z *reductionPointer, + Z *extraParams, + int *allocationPointer, Z *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { @@ -782,8 +782,8 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha static inline __device__ void execSpecialCuda( X *dx, Nd4jLong *xShapeBuffer, X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, + X *extraParams, int *allocationPointer, + X *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { __shared__ int strideex, stridech, stridekrow, stridekcol, striderow, stridecol, kernelHeight, kernelWidth, strideY, strideX, padHeight, padWidth, imgHeight, imgWidth, dY, dX, samples, depth, imgH, imgW, height_col, width_col, n, kEffectiveW, kEffectiveH; @@ -856,7 +856,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { int h_k = (h_im - h_col * strideY); int w_k = (w_im - w_col * strideX); - + if(h_k % dY == 0 && w_k % dX == 0){ h_k /= dY; w_k /= dX; @@ -892,7 +892,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha auto colShape = shape::shapeOf(colShapeBuffer); auto colStride = shape::stride(colShapeBuffer); auto imShape = shape::shapeOf(imShapeBuffer); - auto imStride = shape::stride(imShapeBuffer); + auto imStride = shape::stride(imShapeBuffer); const int sH = (int)extraParams[0]; const int sW = (int)extraParams[1]; @@ -900,13 +900,13 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha const int pW = (int)extraParams[3]; const int iH = (int)extraParams[4]; const int iW = (int)extraParams[5]; - const int dH = (int)extraParams[6]; - const int dW = (int)extraParams[7]; + const int dH = (int)extraParams[6]; + const int dW = (int)extraParams[7]; const int bS = imShape[0]; const int iC = imShape[1]; const int kH = colShape[2]; - const int kW = colShape[3]; + const int kW = colShape[3]; const int oH = colShape[4]; const int oW = colShape[5]; const Nd4jLong colStride0 = colStride[0]; @@ -932,12 +932,12 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha if (shape::order(colShapeBuffer) == 'c' && shape::order(imShapeBuffer) == 'c' && shape::strideDescendingCAscendingF(colShapeBuffer) && shape::strideDescendingCAscendingF(imShapeBuffer)) { PRAGMA_OMP_PARALLEL_FOR_ARGS(private(col, im, imRow, imCol) collapse(2)) - for (int b = 0; b < bS; b++) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { + for (int b = 0; b < bS; b++) { + for (int c = 0; 
c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { for (int colH = 0; colH < oH; ++colH) { - for (int colW = 0; colW < oW; ++colW) { + for (int colW = 0; colW < oW; ++colW) { imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; @@ -952,21 +952,21 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } } } - } + } } else { PRAGMA_OMP_PARALLEL_FOR_ARGS(private(im, col, imRow, imCol)) - for (int b = 0; b < bS; b++) { + for (int b = 0; b < bS; b++) { for (int colH = 0; colH < oH; ++colH) { for (int colW = 0; colW < oW; ++colW) { - for (int c = 0; c < iC; ++c) { - for (int kRow = 0; kRow < kH; ++kRow) { - for (int kCol = 0; kCol < kW; ++kCol) { - + for (int c = 0; c < iC; ++c) { + for (int kRow = 0; kRow < kH; ++kRow) { + for (int kCol = 0; kCol < kW; ++kCol) { + imRow = (-pH + kRow * dH) + colH*sH; imCol = (-pW + kCol * dW) + colW*sW; - + col = colBuff + b*colStride0 + c*colStride1 + kRow*colStride2 + kCol*colStride3 + colH*colStride4 + colW*colStride5; im = imBuff + b*imStride0 + c*imStride1 + imRow*imStride2 + imCol*imStride3; @@ -975,9 +975,9 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } } } - } + } } - } + } } } @@ -1021,10 +1021,10 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha static const bool requiresSpecial = true; #ifdef __CUDACC__ - static inline __device__ void execSpecialCuda(X *dx, Nd4jLong *xShapeBuffer, - X *result, Nd4jLong *zShapeBuffer, - X *extraParams, int *allocationPointer, - X *reductionPointer, + static inline __device__ void execSpecialCuda(X *dx, Nd4jLong *xShapeBuffer, + X *result, Nd4jLong *zShapeBuffer, + X *extraParams, int *allocationPointer, + X *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { __shared__ Nd4jLong xLength; @@ -1064,12 +1064,12 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha dx[idx2] = dx[idx1]; dx[idx1] = tmp; } - } - else { + } + else { for (int e = tid; e < xLength / 2; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer, xLength); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer, xLength); + auto xOffset = shape::getIndexOffset(e, xShapeBuffer); + auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); result[zOffset] = dx[xOffset]; } } @@ -1094,12 +1094,12 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { result[(sLength - e) * zEWS] = dx[e * xEWS]; } - } - else { + } + else { for (int e = tid; e < xLength; e += blockDim.x * gridDim.x) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer, xLength); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer, xLength); + auto xOffset = shape::getIndexOffset(e, xShapeBuffer); + auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer); result[zOffset] = dx[xOffset]; } } @@ -1134,13 +1134,13 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha dx[idx2] = dx[idx1]; dx[idx1] = tmp; } - } + } else { PRAGMA_OMP_PARALLEL_FOR_SIMD - for (Nd4jLong e = 0; e < xLength / 2; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer, xLength); - auto zOffset = shape::getIndexOffset(sLength - e, xShapeBuffer, xLength); + for (Nd4jLong e = 0; e < xLength / 2; e++) { + auto xOffset = shape::getIndexOffset(e, xShapeBuffer); + auto zOffset = shape::getIndexOffset(sLength - e, 
xShapeBuffer); result[zOffset] = dx[xOffset]; } @@ -1160,13 +1160,13 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha for (Nd4jLong e = 0; e < xLength; e++) { result[(sLength - e) * zEWS] = dx[e * xEWS]; } - } + } else { PRAGMA_OMP_PARALLEL_FOR_SIMD for (Nd4jLong e = 0; e < xLength; e++) { - auto xOffset = shape::getIndexOffset(e, xShapeBuffer, xLength); - auto zOffset = shape::getIndexOffset(sLength - e, zShapeBuffer, xLength); + auto xOffset = shape::getIndexOffset(e, xShapeBuffer); + auto zOffset = shape::getIndexOffset(sLength - e, zShapeBuffer); result[zOffset] = dx[xOffset]; } } @@ -1192,7 +1192,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha void *vx, Nd4jLong *xShapeBuffer, void *vresult, Nd4jLong *zShapeBuffer, void *vextraParams, - int *allocationPointer, void *reductionPointer, + int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto dx = reinterpret_cast(vx); @@ -1263,10 +1263,10 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha tadShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); } - + const uint tadLen = shape::length(tadShapeInfo); const uint numOfTads = shape::length(xShapeInfo) / tadLen; - + if(shape::elementWiseStride(tadShapeInfo) == 1) { PRAGMA_OMP_PARALLEL_FOR_SIMD @@ -1277,18 +1277,18 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha X max = -nd4j::DataTypeUtils::max(); X sum = 0; - + for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[j]); - + max = nd4j::math::nd4j_max(max, inBuff[j]); + for (uint j = 0; j < tadLen; ++j) { X temp = nd4j::math::nd4j_exp(inBuff[j] - max); outBuff[j] = temp; sum += temp; } - + for (uint j = 0; j < tadLen; ++j) - outBuff[j] /= sum; + outBuff[j] /= sum; } } else { @@ -1300,17 +1300,17 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha shape::calcOffsets(tadShapeInfo, offsets); PRAGMA_OMP_PARALLEL_FOR_SIMD - for (uint i = 0; i < numOfTads; ++i) { + for (uint i = 0; i < numOfTads; ++i) { X* inBuff = x + tadOffsets[i]; X* outBuff = z + tadOffsets[i]; X max = -nd4j::DataTypeUtils::max(); - X sum = 0.f; + X sum = 0.f; + + for(uint j = 0; j < tadLen; ++j) + max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - for(uint j = 0; j < tadLen; ++j) - max = nd4j::math::nd4j_max(max, inBuff[offsets[j]]); - for (uint j = 0; j < tadLen; ++j) { X temp = nd4j::math::nd4j_exp(inBuff[offsets[j]] - max); outBuff[offsets[j]] = temp; @@ -1351,7 +1351,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha delete[] maxResultShapeBuffer; delete[] maxResult; - } + } } else if (shape::isVector(xShapeInfo)) { auto max = -nd4j::DataTypeUtils::max(); @@ -1416,7 +1416,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha void *vx, Nd4jLong *xShapeBuffer, void *vresult, Nd4jLong *zShapeBuffer, void *vextraParams, - int *allocationPointer, void *reductionPointer, + int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto shape = shape::shapeOf(xShapeBuffer); @@ -1578,7 +1578,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha void *vx, Nd4jLong *xShapeBuffer, void *vresult, Nd4jLong *zShapeBuffer, void *vextraParams, - int *allocationPointer, void *reductionPointer, + int *allocationPointer, void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { 
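
// Every hunk in this file applies the same mechanical substitution: shape::getIndexOffset
// has dropped its trailing arrLen argument, because the array length is recoverable from
// the shapeInfo descriptor itself. Below is a minimal sketch of the resulting call
// pattern, modeled on the f * (1 - f) derivative loop in a later hunk of this file;
// the function name elementwiseDerivative and its parameters are illustrative only,
// not part of this patch, and the shape.h helpers are assumed to be in scope:

template <typename X>
static void elementwiseDerivative(X *buffer, const Nd4jLong *shapeInfo) {
    const auto len = shape::length(shapeInfo);                     // length comes from shapeInfo...
    for (Nd4jLong i = 0; i < len; i++) {
        const auto offset = shape::getIndexOffset(i, shapeInfo);   // ...so it is no longer passed here
        buffer[offset] = buffer[offset] * ((X) 1.0f - buffer[offset]);
    }
}
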
auto dx = reinterpret_cast(vx); @@ -1650,7 +1650,7 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha auto dx = reinterpret_cast(vx); auto result = reinterpret_cast(vresult); auto extraParams = reinterpret_cast(vextraParams); - + if (shape::isMatrix(xShapeBuffer, 2)) { auto shape = shape::shapeOf(xShapeBuffer); @@ -1700,9 +1700,9 @@ static void execSpecial(T *in, Nd4jLong *inShapeBuffer, Z *out, Nd4jLong *outSha } } else { - - for (int i = 0; i < len; i++) { - Nd4jLong zOffset = shape::getIndexOffset(i, zShapeBuffer, len); + + for (int i = 0; i < len; i++) { + Nd4jLong zOffset = shape::getIndexOffset(i, zShapeBuffer); result[zOffset] = result[zOffset] * ((X) 1.0f - result[zOffset]); } } @@ -2013,8 +2013,8 @@ PRAGMA_OMP_CRITICAL static inline __device__ void execSpecialCuda( void *vx, Nd4jLong *xShapeBuffer, void *vresult, Nd4jLong *zShapeBuffer, - void *vextraParams, int *allocationPointer, - void *reductionPointer, + void *vextraParams, int *allocationPointer, + void *reductionPointer, Nd4jLong *tadShapeInfo, Nd4jLong *tadOffsets) { auto dx = reinterpret_cast(vx); @@ -2162,7 +2162,7 @@ PRAGMA_OMP_CRITICAL //decompose in to several sub tads after //moving all dimensions (in sorted order) //to the back. - //permuted version of the x shape info for setting up the tad problem + //permuted version of the x shape info for setting up the tad problem auto tadShapeShapeInfo = tadShapeInfo; if(tadShapeInfo==nullptr) { auto tadPack = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(xShapeBuffer, dimension, dimensionLength); @@ -2170,7 +2170,7 @@ PRAGMA_OMP_CRITICAL tadShapeShapeInfo = tadPack.primaryShapeInfo(); tadOffsets = tadPack.primaryOffsets(); tadShapeInfo = tadShapeShapeInfo; - } + } auto tadLength = shape::length(tadShapeInfo);//shape::tadLength(xShapeBuffer, dimension, dimensionLength); auto tads = shape::length(xShapeBuffer) / tadLength; diff --git a/libnd4j/include/ops/special_random_ops.h b/libnd4j/include/ops/special_random_ops.h index 0d90c212a..1ae310ad4 100644 --- a/libnd4j/include/ops/special_random_ops.h +++ b/libnd4j/include/ops/special_random_ops.h @@ -111,24 +111,24 @@ namespace randomOps { } // __syncthreads(); // Eliminated due RTX20xx specific } - } + } else { - + for (Nd4jLong i = tid; i < zLength; i+=blockDim.x * gridDim.x) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, zLength); + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); T prob = rng->relativeT(i); T cumProb = (T) 0.0f; for (Nd4jLong f = 0; f < yLength; f++) { - - auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer, yLength); + + auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); T relProb = y[yOffset2]; cumProb += relProb; - if (prob <= cumProb || f == yLength - 1) { - - auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer, xLength); + if (prob <= cumProb || f == yLength - 1) { + + auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); z[zOffset2] = x[xOffset2]; f += yLength; } @@ -179,25 +179,25 @@ namespace randomOps { } } } - } + } else { PRAGMA_OMP_PARALLEL_FOR_THREADS(_threads) for (Nd4jLong i = 0; i < zLength; i++) { - auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer, zLength); + auto zOffset2 = shape::getIndexOffset(i, zShapeBuffer); T prob = rng->relativeT(i); T cumProb = (T) 0.0f; for (Nd4jLong f = 0; f < yLength; f++) { - - auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer, yLength); + + auto yOffset2 = shape::getIndexOffset(f, yShapeBuffer); T relProb = y[yOffset2]; cumProb += relProb; - if (prob <= cumProb || f == yLength - 
1) { - - auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer, xLength); + if (prob <= cumProb || f == yLength - 1) { + + auto xOffset2 = shape::getIndexOffset(f, xShapeBuffer); z[zOffset2] = x[xOffset2]; break; } @@ -571,8 +571,8 @@ namespace randomOps { } } }; - -////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////// // This Op produces random Gaussian values within [mean-2*stddev,mean+2*stddev] template class TruncatedNormalDistribution { diff --git a/libnd4j/include/pairwise_util.h b/libnd4j/include/pairwise_util.h index c4b84bee2..e87e8961d 100755 --- a/libnd4j/include/pairwise_util.h +++ b/libnd4j/include/pairwise_util.h @@ -50,7 +50,7 @@ namespace shape { Nd4jLong elementWiseStride(const Nd4jLong *shapeInfo); char order(const Nd4jLong *shapeInfo); bool isStrideSimple(const Nd4jLong* shapeInfo); - Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo, Nd4jLong arrLen); + Nd4jLong getIndexOffset(Nd4jLong index, const Nd4jLong *shapeInfo); } */ @@ -269,13 +269,13 @@ public: Nd4jLong chunks; Nd4jLong modulo; Nd4jLong remainder; - + BlockInformation(Nd4jLong length, int threshold) { threads = length / threshold; threads = nd4j::math::nd4j_max(1, threads); threads = nd4j::math::nd4j_min(threads, omp_get_max_threads()); - + items = length / threads; remainder = length % threads; if(items < 1) diff --git a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp index d8cf86495..a5664a24b 100644 --- a/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/ConvolutionTests1.cpp @@ -403,7 +403,7 @@ TYPED_TEST(TypedConvolutionTests1, sconv2d_3) { auto input = NDArrayFactory::create('c', {3, 3, 8, 8}); auto weightsD = NDArrayFactory::create('c', {1, 3, 1, 1}); auto weightsP = NDArrayFactory::create('c', {2, 3, 1, 1}); - auto bias = NDArrayFactory::create('c', {1, 2}); + auto bias = NDArrayFactory::create('c', {2}); auto output = NDArrayFactory::create('c', {3, 2, 8, 8}); output.assign(0.0); @@ -911,7 +911,7 @@ TEST_F(ConvolutionTests1, TestDeconv_ff_2) { auto input = NDArrayFactory::create('c', {3, 3, 4, 4}); auto weights = NDArrayFactory::create('c',{3, 2, 1, 1}); - auto bias = NDArrayFactory::create('c', {1, 2}); + auto bias = NDArrayFactory::create('c', {2}); input.linspace(1); weights.linspace(1); @@ -935,11 +935,11 @@ TEST_F(ConvolutionTests1, TestDeconv_ff_2) { TYPED_TEST(TypedConvolutionTests1, Test_Conv1D_ff_1) { auto input = NDArrayFactory::create('c', {2, 2, 6}); auto weights = NDArrayFactory::create('c', {2, 2, 3}, {1,5,9,3,7,11,2,6,10,4,8,12}); - auto bias = NDArrayFactory::create('c', {1, 3}); + auto bias = NDArrayFactory::create('c', {3}); auto expFF = NDArrayFactory::create('c', {2, 3, 5}, {59.0, 69.0, 79.0, 89.0, 99.0, 132.0, 158.0, 184.0, 210.0, 236.0, 205.0, 247.0, 289.0, 331.0, 373.0, 179.0, 189.0, 199.0, 209.0, 219.0, 444.0, 470.0, 496.0, 522.0, 548.0, 709.0, 751.0, 793.0, 835.0, 877.0}); auto expEps = NDArrayFactory::create('c', {2, 2, 6}, {130.0, 293.0, 326.0, 359.0, 392.0, 220.0, 166.0, 371.0, 416.0, 461.0, 506.0, 280.0, 355.0, 788.0, 821.0, 854.0, 887.0, 490.0, 481.0, 1046.0, 1091.0, 1136.0, 1181.0, 640.0}); auto expGW = NDArrayFactory::create('c', {3, 2, 2}, {1415.0, 1520.0, 2045.0, 2150.0, 1865.0, 2020.0, 2795.0, 2950.0, 2315.0, 2520.0, 3545.0, 3750.0}); - auto expGB = NDArrayFactory::create('c', {1, 3}, {105.0, 155.0, 205.0}); + auto expGB = NDArrayFactory::create('c', {3}, {105.0, 
155.0, 205.0}); expGW.permutei({2,1,0}); input.linspace(1); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp index 01e8e82c2..7428539f3 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests11.cpp @@ -47,7 +47,7 @@ TEST_F(DeclarableOpsTests11, test_mixed_biasadd_1) { auto exp = NDArrayFactory::create('c', {2, 3}, {1.f, 2.f, 3.f, 1.f, 2.f, 3.f}); nd4j::ops::biasadd op; - auto status = op.execute({&x, &y}, {&z}, {}, {}, {}); + auto status = op.execute({&x, &y}, {&z}, {}, {}, {true}); ASSERT_EQ(Status::OK(), status); ASSERT_EQ(exp, z); @@ -66,11 +66,11 @@ TEST_F(DeclarableOpsTests11, test_listdiff_1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-12.49997,-13.04346, -13.63635, -14.28571,-14.99999,-15.78947, -16.66666, -17.64705,-18.75 ,-20. , -21.42857, -23.07692, -24.99999,-27.27272, -29.99999, -33.33332,-37.49999,-42.85713, -49.99998, -59.99998,-74.99995,-99.99992,-149.99986,-299.99911}); NDArray dLdwExp('c', {2,3,4}, {3.21887, 4.96807, 6.10512, 6.80726, 7.15461, 7.19051, 6.93973, 6.41584, 5.62456, 4.56548, 3.2326 , 1.61444, @@ -80,14 +80,14 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {0}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -103,16 +103,16 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {2,1,4}, {15.99805, 16.72406, 16.27746, 14.83754,-44.97147,-59.99582,-79.28771,-107.35497}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {0}); @@ -120,7 +120,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -129,11 +129,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-12.49997,-13.04346, -13.63635, -14.28571,-14.99999,-15.78947, -16.66666, -17.64705,-18.75 ,-20. 
, -21.42857, -23.07692, -24.99999,-27.27272, -29.99999, -33.33332,-37.49999,-42.85713, -49.99998, -59.99998,-74.99995,-99.99992,-149.99986,-299.99911}); NDArray dLdwExp('c', {}, {-227.77286}); @@ -142,14 +142,14 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -165,22 +165,22 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test4) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {4.8876 , -46.29156, -186.36887}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto *dLdw = results->at(1); // dLdw->printIndexedBuffer(); // dLdw->printShapeInfo(); @@ -193,11 +193,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-1.04166,-1.08696, -1.13636, -1.19048,-1.25 ,-1.31579, -1.38889, -1.47059,-1.5625 ,-1.66667, -1.78571, -1.92308, -2.08333,-2.27273, -2.5 , -2.77778,-3.125 ,-3.57143, -4.16667, -5. 
,-6.25 ,-8.33333,-12.49999,-24.99993}); NDArray dLdwExp('c', {2,3,4}, {1.05912, 1.20488, 1.29964, 1.35815, 1.3871 , 1.39009, 1.36919, 1.32553, 1.25959, 1.17133, 1.06026, 0.92541, @@ -207,14 +207,14 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -230,16 +230,16 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {6.73432, 2.46939,-9.20372}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {2}); @@ -247,7 +247,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -256,16 +256,16 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test7) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {}, {0.}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {2}); @@ -273,20 +273,20 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. ,-1.5 ,-1.57895, -1.66667, -1.76471,-1.875 ,-2. , -2.14286, -2.30769, -2.5 ,-2.72727, -3. , -3.33333,-3.75 ,-4.28571, -5. , -6. 
,-7.49999,-9.99999,-14.99999,-29.99991}); NDArray dLdwExp('c', {2,3,4}, {1.56625, 1.74117, 1.85487, 1.92509, 1.95982, 1.96341, 1.93833, 1.88594, 1.80682, 1.70091, 1.56762, 1.4058 , @@ -307,7 +307,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -323,11 +323,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.52083,-0.54348,-0.56818, -0.59524,-0.625 ,-0.65789,-0.69444, -0.73529,-0.78125,-0.83333,-0.89286, -0.96154, -1.04167,-1.13636,-1.25 , -1.38889,-1.5625 ,-1.78571,-2.08333, -2.5 ,-3.125 ,-4.16666,-6.24999,-12.49996}); NDArray dLdwExp('c', {2,3,4}, {0.13412, 0.207 , 0.25438, 0.28364, 0.29811, 0.2996 , 0.28916, 0.26733, 0.23436, 0.19023, 0.13469, 0.06727, @@ -338,13 +338,13 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - + nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -357,10 +357,10 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test9) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); @@ -375,8 +375,8 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -386,7 +386,7 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); @@ -401,8 +401,8 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -412,11 +412,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, { 0. , 0. , 0. , 0. ,-0.75 ,-0.789473,-0.833333, -0.882353,-0.9375 ,-1. 
,-1.071428, -1.153846, -1.25 ,-1.363636,-1.5 , -1.666666,-1.875 ,-2.142857,-2.499999, -2.999999,-3.749997,-4.999997,-7.499993,-14.999956}); NDArray dLdwExp('c', {2,3,4}, {0.16094, 0.2484 , 0.30526, 0.34036, 0.35773, 0.35953, 0.34699, 0.32079, 0.28123, 0.22827, 0.16163, 0.08072, @@ -433,15 +433,15 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { weights.t(2) = 0.; weights.t(3) = 0.; - + nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -455,11 +455,11 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, log_loss_grad_test13) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , -2.08333,-2.27273, -2.5 , -2.77778,-3.125 ,-3.57143, -4.16667, -5. ,-6.25 ,-8.33333,-12.49999,-24.99993}); NDArray dLdwExp('c', {2,3,1}, {1.75828, 2.30839, 1.25309, -1.35098, -6.16602,-16.78383}); @@ -471,16 +471,16 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test13) { weights.assign(0.5); weights.t(0) = 0.; weights.t(1) = 0.; - weights.t(2) = 0.; - + weights.t(2) = 0.; + nd4j::ops::log_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {1e-7}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -494,10 +494,10 @@ TEST_F(DeclarableOpsTests11, log_loss_grad_test13) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, summaryStatsData_test1) { - + functions::summarystats::SummaryStatsData var1; functions::summarystats::SummaryStatsData var2; - var2.n = var2.mean = var2.M2 = var2.M3 = var2.M4 = var2.bias = 5; + var2.n = var2.mean = var2.M2 = var2.M3 = var2.M4 = var2.bias = 5; functions::summarystats::SummaryStatsData* arr = new functions::summarystats::SummaryStatsData[2]; arr[0] = var1; @@ -515,11 +515,11 @@ TEST_F(DeclarableOpsTests11, summaryStatsData_test1) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.96, -1.92, -2.88, -3.84, -4.8 , -5.76, -6.72, -7.68, -8.64, -9.6 ,-10.56,-11.52, -12.48,-13.44,-14.4 ,-15.36,-16.32,-17.28,-18.24,-19.2 ,-20.16,-21.12,-22.08,-23.04}); NDArray dLdwExp('c', {2,3,4}, {0.9216 , 3.6864 , 8.2944 , 14.7456 , 23.04 , 33.1776 , 45.1584 , 58.9824 , 74.6496 , 92.16 ,111.51361,132.7104 , @@ -527,14 +527,14 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, 
{0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto dLdp = results->at(0); + auto dLdp = results->at(0); auto dLdw = results->at(1); auto dLdl = results->at(2); @@ -547,19 +547,19 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test1) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {2,1,4}, {98.61121,129.024 , 164.9664 , 206.4384 , 828.51837,925.28644,1027.58398,1135.41113}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {0}); @@ -567,7 +567,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -576,25 +576,25 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test3) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.96, -1.92, -2.88, -3.84, -4.8 , -5.76, -6.72, -7.68, -8.64, -9.6 ,-10.56,-11.52, -12.48,-13.44,-14.4 ,-15.36,-16.32,-17.28,-18.24,-19.2 ,-20.16,-21.12,-22.08,-23.04}); NDArray dLdwExp('c', {}, {4515.84}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -610,22 +610,22 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test4) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {807.32153, 1426.63684, 2281.88159}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); @@ -636,11 +636,11 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.08,-0.16,-0.24,-0.32,-0.4 ,-0.48,-0.56,-0.64,-0.72,-0.8 ,-0.88,-0.96, -1.04,-1.12,-1.2 ,-1.28,-1.36,-1.44,-1.52,-1.6 ,-1.68,-1.76,-1.84,-1.92}); NDArray dLdwExp('c', {2,3,4}, {-15.6032,-15.3728,-14.9888,-14.4512,-13.76 
,-12.9152,-11.9168,-10.7648, -9.4592, -8. , -6.3872, -4.6208, @@ -648,14 +648,14 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -671,16 +671,16 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {-58.16319, -6.5536 , 64.71682}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); @@ -688,7 +688,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -697,16 +697,16 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test7) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {}, {0.}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); @@ -714,20 +714,20 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. ,0. ,0. ,0. 
,-0.48 ,-0.576,-0.672,-0.768,-0.864,-0.96 ,-1.056,-1.152, -1.248,-1.344,-1.44 ,-1.536,-1.632,-1.728,-1.824,-1.92 ,-2.016,-2.112,-2.208,-2.304}); NDArray dLdwExp('c', {2,3,4}, {-22.3488 ,-22.07232,-21.61152,-20.9664 ,-20.13696,-19.1232 ,-17.92512,-16.54272,-14.976 ,-13.22496,-11.2896 , -9.16992, @@ -746,7 +746,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -762,11 +762,11 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.04,-0.08,-0.12,-0.16,-0.2 ,-0.24,-0.28,-0.32,-0.36,-0.4 ,-0.44,-0.48, -0.52,-0.56,-0.6 ,-0.64,-0.68,-0.72,-0.76,-0.8 ,-0.84,-0.88,-0.92,-0.96}); NDArray dLdwExp('c', {2,3,4}, {0.0384, 0.1536, 0.3456, 0.6144, 0.96 , 1.3824, 1.8816, 2.4576, 3.1104, 3.84 , 4.6464, 5.5296, @@ -775,13 +775,13 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - + nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -797,7 +797,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); @@ -812,8 +812,8 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -823,7 +823,7 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); @@ -838,8 +838,8 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -849,11 +849,11 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0.,0.,0.,0., -0.24 
,-0.288,-0.336,-0.384,-0.432,-0.48 ,-0.528,-0.576, -0.624,-0.672,-0.72 ,-0.768,-0.816,-0.864,-0.912,-0.96 ,-1.008,-1.056,-1.104,-1.152}); NDArray dLdwExp('c', {2,3,4}, {0.04608, 0.18432, 0.41472, 0.73728, 1.152 , 1.65888, 2.25792, 2.94912, 3.73248, 4.608 , 5.57568, 6.63552, @@ -866,15 +866,15 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { weights.t(1) = 0.; weights.t(2) = 0.; weights.t(3) = 0.; - + nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -888,11 +888,11 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test13) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., -1.04,-1.12,-1.2 ,-1.28,-1.36,-1.44,-1.52,-1.6 ,-1.68,-1.76,-1.84,-1.92}); NDArray dLdwExp('c', {2,3,1}, {2.304 , 13.3632 , 34.2528 , 64.97279,105.5232 ,155.90401}); @@ -902,16 +902,16 @@ TEST_F(DeclarableOpsTests11, mean_sqerr_loss_grad_test13) { weights.assign(0.5); weights.t(0) = 0.; weights.t(1) = 0.; - weights.t(2) = 0.; - + weights.t(2) = 0.; + nd4j::ops::mean_sqerr_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -963,11 +963,11 @@ TEST_F(DeclarableOpsTests11, SquaredSubtractTest_Test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5, -0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5}); NDArray dLdwExp('c', {2,3,4}, {0.96, 1.92, 2.88, 3.84, 4.8 , 5.76, 6.72, 7.68, 8.64, 9.6 ,10.56,11.52, @@ -975,14 +975,14 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto dLdp = results->at(0); + auto dLdp = results->at(0); auto dLdw = results->at(1); auto dLdl = results->at(2); @@ -995,19 +995,19 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test1) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); - + NDArray 
dLdwExp('c', {2,1,4}, {14.4 , 17.28, 20.16, 23.04, 48.96, 51.84, 54.72, 57.6}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {0}); @@ -1015,7 +1015,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1024,25 +1024,25 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test3) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5, -0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5}); NDArray dLdwExp('c', {}, {288.}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1058,22 +1058,22 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test4) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {65.28, 96., 126.72001}); predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); @@ -1084,11 +1084,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167, -0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167,-0.04167}); NDArray dLdwExp('c', {2,3,4}, {-0.92,-0.84,-0.76,-0.68,-0.6 ,-0.52,-0.44,-0.36,-0.28,-0.2 ,-0.12,-0.04, @@ -1096,14 +1096,14 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1119,16 
+1119,16 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {-2.56, 0., 2.56}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); @@ -1136,7 +1136,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1145,16 +1145,16 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test7) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {}, {0.}); - + predictions.linspace(0.04, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {2}); @@ -1162,20 +1162,20 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0. ,-0. ,-0. ,-0. 
,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05, -0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05}); NDArray dLdwExp('c', {2,3,4}, {-1.296,-1.2 ,-1.104,-1.008,-0.912,-0.816,-0.72 ,-0.624,-0.528,-0.432,-0.336,-0.24 , @@ -1194,7 +1194,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1210,11 +1210,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083,-0.02083, -0.02083, -0.02083, -0.02083}); NDArray dLdwExp('c', {2,3,4}, {0.04, 0.08, 0.12, 0.16, 0.2 , 0.24, 0.28, 0.32,0.36, 0.4 , 0.44, 0.48, @@ -1223,13 +1223,13 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { predictions.linspace(0.04, 0.04); labels.linspace(1); weights.assign(0.5); - + nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1245,7 +1245,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test9) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); @@ -1260,8 +1260,8 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1271,7 +1271,7 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); @@ -1286,8 +1286,8 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1297,11 +1297,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, 
nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., -0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025,-0.025, -0.025, -0.025, -0.025}); NDArray dLdwExp('c', {2,3,4}, {0.048, 0.096, 0.144, 0.192,0.24 , 0.288, 0.336, 0.384,0.432, 0.48 , 0.528, 0.576, @@ -1314,15 +1314,15 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { weights.t(1) = 0.; weights.t(2) = 0.; weights.t(3) = 0.; - + nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -1336,11 +1336,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray predictions('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0., 0., 0., 0., 0., 0., 0., 0.,0., 0., 0., 0., -0.04167, -0.04167, -0.04167, -0.04167,-0.04167, -0.04167, -0.04167, -0.04167,-0.04167, -0.04167, -0.04167, -0.04167}); NDArray dLdwExp('c', {2,3,1}, {0.8 ,2.08,3.36,4.64,5.92,7.2 }); @@ -1350,16 +1350,16 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { weights.assign(0.5); weights.t(0) = 0.; weights.t(1) = 0.; - weights.t(2) = 0.; - + weights.t(2) = 0.; + nd4j::ops::absolute_difference_loss_grad op; auto results = op.execute({&predictions, &weights, &labels}, {}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -1373,11 +1373,11 @@ TEST_F(DeclarableOpsTests11, absolute_difference_loss_grad_test13) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { - + NDArray x = NDArrayFactory::create('c', {2,3,4}); NDArray y = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); NDArray exp = NDArrayFactory::create('c', {2,3,4});//('c', {2,3,4}, nd4j::DataType::BFLOAT16); - + x.linspace(1); y.linspace(1); exp.linspace(2,2); @@ -1385,7 +1385,7 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_1) { auto results = op.execute({&x, &y}, {}, {}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto res = results->at(0); res->printIndexedBuffer("BFloat16 sum:"); ASSERT_TRUE(res->equalsTo(exp)); @@ -1439,11 +1439,11 @@ TEST_F(DeclarableOpsTests11, BFloat16_Test_3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.25999, -0.755 , -1.25 , -1.745 , -2.24001, -2.73502, -3.23004, -3.72508, -4.22014, -4.71523, -5.21034, -5.70548, -6.20066, -6.69587, -7.19113, -7.68643, -8.18177, 
-8.67717, -9.17262, -9.66813,-10.1637 ,-10.65932,-11.15501,-11.65077}); NDArray dLdwExp('c', {2,3,4}, {0.73395, 0.75335, 0.69315, 0.55335, 0.33395, 0.03495, -0.34366, -0.80186, -1.33967, -1.95708, -2.65411, -3.43074, @@ -1453,14 +1453,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1473,14 +1473,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test1) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test2) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,1,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.18499,-0.53 ,-0.875 ,-1.22 ,-1.56501,-1.91002,-2.25504,-2.60008,-2.94514,-3.29023,-3.63534,-3.98048, -4.32566,-4.67087,-5.01613,-5.36143,-5.70677,-6.05217,-6.39762,-6.74313,-7.0887 ,-7.43432,-7.78001,-8.12577}); NDArray dLdwExp('c', {2,1,4}, {0.43622, -0.19079, -0.98462, -1.94525,-18.09855,-20.72768,-23.52373,-26.48669}); @@ -1489,14 +1489,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test2) { logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {0}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1512,11 +1512,11 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test2) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.18499,-0.53 ,-0.875 ,-1.22 ,-1.56501,-1.91002,-2.25504,-2.60008,-2.94514,-3.29023,-3.63534,-3.98048, -4.32566,-4.67087,-5.01613,-5.36143,-5.70677,-6.05217,-6.39762,-6.74313,-7.0887 ,-7.43432,-7.78001,-8.12577}); NDArray dLdwExp('c', {}, {-91.52109}); @@ -1525,14 +1525,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1548,22 +1548,22 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test3) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test4) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {-12.54779,-28.13393,-50.83936}); - + logits.linspace(-0.08, 0.04); 
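    // Note on the call pattern used throughout these loss-grad tests: in
    // op.execute({inputs}, {tArgs}, {iArgs}) the first list carries the input
    // arrays, the second the floating-point arguments, and the third the
    // integer arguments. For sigm_cross_entropy_loss_grad the float argument
    // is the label-smoothing factor and the integer argument selects the
    // reduction mode (0 = none, 1 = sum, 2 = weighted mean, 3 = mean over
    // nonzero weights -- mode names inferred from how the expected gradients
    // in this file scale, not taken from the op's documentation). Also,
    // linspace(a, s) fills a buffer starting at a with step s, so the logits
    // here hold -0.08, -0.04, 0., 0.04, ... A minimal sketch of one call:
    //
    //   nd4j::ops::sigm_cross_entropy_loss_grad gradOp;
    //   auto res = gradOp.execute({&logits, &weights, &labels},
    //                             {0.3},   // label smoothing (assumed meaning)
    //                             {1});    // reduction mode: sum (assumed)
    //   auto dLdp = res->at(0);            // gradient w.r.t. logits
    //   auto dLdw = res->at(1);            // gradient w.r.t. weights
    //   auto dLdl = res->at(2);            // gradient w.r.t. labels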
labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {1}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); @@ -1574,11 +1574,11 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test4) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.01542,-0.04417,-0.07292,-0.10167,-0.13042,-0.15917,-0.18792,-0.21667,-0.24543,-0.27419,-0.30294,-0.33171, -0.36047,-0.38924,-0.41801,-0.44679,-0.47556,-0.50435,-0.53314,-0.56193,-0.59072,-0.61953,-0.64833,-0.67715}); NDArray dLdwExp('c', {2,3,4}, {0.37794, 0.37906, 0.37554, 0.36739, 0.35461, 0.33719, 0.31514, 0.28846, 0.25714, 0.22119, 0.18061, 0.13539, @@ -1588,14 +1588,14 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {2}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1611,16 +1611,16 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test5) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {1,3,1}, {1.4966 , 0.19776,-1.69436}); logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {2}); @@ -1628,7 +1628,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1637,16 +1637,16 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test6) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test7) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights(nd4j::DataType::DOUBLE); - + NDArray dLdwExp('c', {}, {0.}); - + logits.linspace(-0.08, 0.04); labels.linspace(1); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {2}); @@ -1654,20 +1654,20 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test7) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto *dLdw = results->at(1); - + ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); 
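    // This test and tests 12/13 below zero out the leading weights through
    // NDArray's flat-index accessor, written here as weights.t(i). In current
    // libnd4j that accessor is templated, weights.t<double>(i); the angle
    // brackets appear to have been stripped by the formatting of this patch.
    // Zero weights null the matching entries of dLdp, which is why the
    // expected arrays open with runs of zeros. Sketch of the masking pattern,
    // assuming the templated accessor:
    //
    //   NDArray w('c', {2,3,4}, nd4j::DataType::DOUBLE);
    //   w.assign(0.5);
    //   for (Nd4jLong i = 0; i < 4; ++i)
    //       w.t<double>(i) = 0.;   // mask the first four weight entries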
NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, { 0. , 0. , 0. , 0. ,-0.1565 ,-0.191 ,-0.2255 ,-0.26001,-0.29451,-0.32902,-0.36353,-0.39805, -0.43257,-0.46709,-0.50161,-0.53614,-0.57068,-0.60522,-0.63976,-0.67431,-0.70887,-0.74343,-0.778 ,-0.81258}); NDArray dLdwExp('c', {2,3,4}, {0.54353, 0.54487, 0.54065, 0.53087, 0.51553, 0.49463, 0.46817, 0.43615, 0.39857, 0.35543, 0.30672, 0.25246, @@ -1687,7 +1687,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1703,11 +1703,11 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test8) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.00771, -0.02208, -0.03646, -0.05083,-0.06521, -0.07958, -0.09396, -0.10834,-0.12271, -0.13709, -0.15147, -0.16585, -0.18024, -0.19462, -0.20901, -0.22339,-0.23778, -0.25217, -0.26657, -0.28096,-0.29536, -0.30976, -0.32417, -0.33857}); NDArray dLdwExp('c', {2,3,4}, {0.03008, 0.03064, 0.02888, 0.02481, 0.01841, 0.00971, -0.00132, -0.01466,-0.03032, -0.0483 , -0.06859, -0.0912 , @@ -1717,13 +1717,13 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { logits.linspace(-0.08, 0.04); labels.linspace(1); weights.assign(0.5); - + nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); @@ -1736,10 +1736,10 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test9) { delete results; } - + /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1}, nd4j::DataType::DOUBLE); @@ -1754,8 +1754,8 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1765,7 +1765,7 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test10) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test11) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3,1}, nd4j::DataType::DOUBLE); @@ -1780,8 +1780,8 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test11) { auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - - auto *dLdw = results->at(1); + + auto *dLdw = results->at(1); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); @@ -1791,11 +1791,11 @@ TEST_F(DeclarableOpsTests11, 
sigm_cross_entropy_loss_grad_test11) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. ,-0.07825, -0.0955 , -0.11275, -0.13 ,-0.14726, -0.16451, -0.18177, -0.19902, -0.21628, -0.23354, -0.25081, -0.26807,-0.28534, -0.30261, -0.31988, -0.33716,-0.35443, -0.37172, -0.389 , -0.40629}); NDArray dLdwExp('c', {2,3,4}, {0.0361 , 0.03677, 0.03466, 0.02977, 0.0221 , 0.01165, -0.00158, -0.01759,-0.03638, -0.05795, -0.08231, -0.10944, @@ -1810,15 +1810,15 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { weights.t(2) = 0.; weights.t(3) = 0.; - + nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -1832,11 +1832,11 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test12) { /////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test13) { - + NDArray labels('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2,3,1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , -0.36047, -0.38924, -0.41801, -0.44679,-0.47556, -0.50435, -0.53314, -0.56193,-0.59072, -0.61953, -0.64833, -0.67715}); NDArray dLdwExp('c', {2,3,1}, {0.22882, 0.02428,-0.4768 ,-1.27447,-2.36878,-3.75981,}); @@ -1847,16 +1847,16 @@ TEST_F(DeclarableOpsTests11, sigm_cross_entropy_loss_grad_test13) { weights.assign(0.5); weights.t(0) = 0.; weights.t(1) = 0.; - weights.t(2) = 0.; - + weights.t(2) = 0.; + nd4j::ops::sigm_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {3}); ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -1940,61 +1940,61 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test1) { NDArray labels('c', {2,4}, {0,0,1,0, 0,1,0,0}, nd4j::DataType::INT32); NDArray logits('c', {2,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,4}, {0.1176, 0.1224, -0.3726, 0.1326, 0.1176, -0.3776, 0.1274, 0.1326}); NDArray dLdwExp('c', {2}, {1.36729, 1.40729}); logits.linspace(-0.08, 0.04); - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::softmax_cross_entropy_loss_grad op; - auto results = op.execute({&logits, &weights, &labels}, {0.}, {0}); - + auto results = op.execute({&logits, &weights, &labels}, {0.}, {0}); + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - 
ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } - + ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test2) { NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); NDArray logits('c', {4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {4}, {0.125, 0.125, -0.375, 0.125}); NDArray dLdwExp('c', {1}, {1.38629}); logits = 2.; - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {1}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; -} +} ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test3) { @@ -2002,30 +2002,30 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test3) { NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); NDArray logits('c', {4}, nd4j::DataType::DOUBLE); NDArray weights('c', {}, {0}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {4}, {0.125, 0.125, -0.375, 0.125}); NDArray dLdwExp('c', {}, {1.38629}); logits = 2.; - weights.assign(0.5); + weights.assign(0.5); nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {1}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; -} +} ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test4) { @@ -2033,30 +2033,30 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test4) { NDArray labels('c', {4}, {0,0,1,0}, nd4j::DataType::INT32); NDArray logits('c', {4}, nd4j::DataType::DOUBLE); NDArray weights('c', {}, {0}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {4}, {0.23521, 0.2448 , -0.7452 , 0.26519}); NDArray dLdwExp('c', {}, {0.}); logits.linspace(-0.08, 0.04); - weights = 0.5; + weights = 0.5; nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {2}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; -} +} ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test5) { @@ -2064,30 +2064,30 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test5) { NDArray labels('c', {4}, {0,0,1,0}, 
nd4j::DataType::INT32); NDArray logits('c', {4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {4}, {0.1176, 0.1224, -0.3726, 0.1326}); NDArray dLdwExp('c', {1}, {1.36729}); logits.linspace(-0.08, 0.04); - weights = 0.5; + weights = 0.5; nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {3}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; -} +} ///////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { @@ -2095,7 +2095,7 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { NDArray labels('c', {2,4}, {0,0,1,0, 0,1,0,0}, nd4j::DataType::INT32); NDArray logits('c', {2,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {2}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,4}, {0.0801, 0.0849, -0.2601, 0.0951, 0.0801, -0.2651, 0.0899, 0.0951}); NDArray dLdwExp('c', {2}, {-0.014000, 0.014000}); @@ -2105,12 +2105,12 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test6) { nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.3}, {2}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); + auto *dLdl = results->at(2); ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2126,27 +2126,27 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test7) { NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1, 1,0,0,0, 0,1,0,0}, nd4j::DataType::INT32); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,3}, {0.5, 0., 1.5}); - + NDArray dLdpExp('c', {2,3,4}, {-0.0956 , 0.0306 , 0.03185, 0.03315, 0.,-0., 0., 0., 0.0882 , 0.0918 ,-0.27945, 0.09945, 0.0294 , 0.0306 , 0.03185,-0.09185,-0., 0., 0., 0., 0.0882 ,-0.2832 , 0.09555, 0.09945}); NDArray dLdwExp('c', {1,3}, {0.69365, 0.71365, 0.69365}); - logits.linspace(-0.08, 0.04); + logits.linspace(-0.08, 0.04); nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {3}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); - auto *dLdl = results->at(2); - + auto *dLdl = results->at(2); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } @@ -2157,40 +2157,40 @@ TEST_F(DeclarableOpsTests11, softmax_cross_entropy_loss_grad_test8) { NDArray labels('c', {2,3,4,5}, {1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0, 0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1, 0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0}, nd4j::DataType::INT32); - + NDArray logits('c', {2,3,4,5}, nd4j::DataType::DOUBLE); NDArray weights('c', {1,1,4}, nd4j::DataType::DOUBLE); - 
+ NDArray dLdpExp('c', {2,3,4,5}, {-0.03399, 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, - 0.00866, 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, - 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, - 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, - 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, - 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, 0.00832, - 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, 0.00768, - 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, 0.00832, 0.00866, + 0.00866, 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, + 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, + 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, + 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, + 0.00768, 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, 0.00832, + 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, 0.00768, + 0.00799, 0.00832,-0.03301, 0.00901, 0.00768, 0.00799, 0.00832, 0.00866,-0.03265,-0.03399, 0.00799, 0.00832, 0.00866, 0.00901, 0.00768,-0.03367, 0.00832, 0.00866, 0.00901, 0.00768, 0.00799,-0.03335, 0.00866, 0.00901, 0.00768, 0.00799, 0.00832,-0.03301, 0.00901}); NDArray dLdwExp('c', {1,1,4}, {0.005, 0.00167, -0.00167, -0.005}); - logits.linspace(-0.08, 0.04); - weights.assign(0.5); + logits.linspace(-0.08, 0.04); + weights.assign(0.5); nd4j::ops::softmax_cross_entropy_loss_grad op; auto results = op.execute({&logits, &weights, &labels}, {0.}, {2}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); + auto *dLdp = results->at(0); auto *dLdw = results->at(1); auto *dLdl = results->at(2); - // dLdp->printIndexedBuffer(); + // dLdp->printIndexedBuffer(); // ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); // ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); ASSERT_TRUE(dLdwExp.isSameShape(dLdw)); - ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); + ASSERT_TRUE(dLdwExp.equalsTo(dLdw)); delete results; } @@ -2212,19 +2212,19 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test1) { NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1, 1,0,0,0, 0,1,0,0}); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.76479, 0.2448, 0.2548, 0.26519, 0.23521,-0.7552, 0.2548, 0.26519, 0.23521, 0.2448,-0.7452, 0.26519, 0.23521, 0.2448, 0.2548,-0.73481,-0.76479, 0.2448, 0.2548, 0.26519, 0.23521,-0.7552, 0.2548, 0.26519}); - logits.linspace(-0.08, 0.04); + logits.linspace(-0.08, 0.04); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); 
ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2236,19 +2236,19 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test2) { NDArray labels('c', {2,3,4}, {1,0,0,0, 0,1,0,1, 0,0,1,0, 0,0,0,1, 1,0,1,0, 0,1,0,0}); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.71836, 0.28164, 0.28164, 0.28164, 0.33051, -0.66949, 0.33051, -0.66949, 0.38785, 0.38785, -0.61215, 0.38785, 0.28164, 0.28164, 0.28164, -0.71836,-0.66949, 0.33051, -0.66949, 0.33051, 0.38785, -0.61215, 0.38785, 0.38785}); - logits.linspace(-0.08, 0.04); + logits.linspace(-0.08, 0.04); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {1}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2260,18 +2260,18 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test3) { NDArray labels('c', {2,3}, {1,0,0, 0,1,1}); NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3}, {-0.52996, 0.47004, 0.47004, 0.52996, -0.47004, -0.47004}); - logits.linspace(-0.08, 0.04); + logits.linspace(-0.08, 0.04); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2283,17 +2283,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test4) { NDArray labels('c', {2,1}, {1,1}); NDArray logits('c', {2,1}, {-0.04, 0.04}); - - NDArray dLdpExp('c', {2,1}, {0., 0.}); + + NDArray dLdpExp('c', {2,1}, {0., 0.}); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {1}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2305,17 +2305,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test5) { NDArray labels('c', {2,1}, {1,0}); NDArray logits('c', {2,1}, {-0.04, 0.04}); - - NDArray dLdpExp('c', {2,1}, {-0.51999, 0.51999}); + + NDArray dLdpExp('c', {2,1}, {-0.51999, 0.51999}); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2327,17 +2327,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test6) { NDArray labels('c', {1,2}, {1,1}); NDArray logits('c', {1,2}, {-0.04, 0.04}); - - NDArray dLdpExp('c', {1,2}, {0, 0}); + + NDArray dLdpExp('c', {1,2}, {0, 0}); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2349,17 +2349,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test7) { NDArray labels('c', {2}, {0,1}); NDArray logits('c', {2}, {-0.04, 0.04}); - + NDArray dLdpExp('c', {2}, {0.48001, -0.48001}); 
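    // The expected value follows from the usual softmax cross-entropy
    // gradient, dL/dlogits = softmax(logits) - labels (stated as a reading
    // aid; the op itself is the reference). Worked out for this test:
    //
    //   double e0 = std::exp(-0.04), e1 = std::exp(0.04);   // the two logits
    //   double p0 = e0 / (e0 + e1);                         // ~0.48001
    //   double p1 = e1 / (e0 + e1);                         // ~0.51999
    //   // labels = {0, 1}  =>  dLdp = {p0 - 0., p1 - 1.} = {0.48001, -0.48001}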
nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2371,17 +2371,17 @@ TEST_F(DeclarableOpsTests11, softmaxCrossEntropyWithLogits_grad_test8) { NDArray labels('c', {1}, {1}); NDArray logits('c', {1}, {0.04}); - + NDArray dLdpExp('c', {1}, {0}); nd4j::ops::softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&logits, &labels}, {}, {0}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2420,19 +2420,19 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test1) { NDArray labels('c', {2}, {2,1}, nd4j::DataType::INT64); NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3}, {0.30061, 0.33222, -0.63283, 0.30061, -0.66778, 0.36717}); - - logits.linspace(0.1, 0.1); + + logits.linspace(0.1, 0.1); nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&labels, &logits}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2444,19 +2444,19 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test2) { NDArray labels('c', {2}, {0,1}, nd4j::DataType::INT64); NDArray logits('c', {2,3}, nd4j::DataType::DOUBLE); - - NDArray dLdpExp('c', {2,3}, {-0.69939, 0.33222, 0.36717, 0.30061, -0.66778, 0.36717}); - + + NDArray dLdpExp('c', {2,3}, {-0.69939, 0.33222, 0.36717, 0.30061, -0.66778, 0.36717}); + logits.linspace(-0.1, 0.1); nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&labels, &logits}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2468,17 +2468,17 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test3) { NDArray labels('c', {}, {1}, nd4j::DataType::INT64); NDArray logits('c', {2}, {-0.2, 0.3}); - + NDArray dLdpExp('c', {2}, {0.37754, -0.37754}); nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&labels, &logits}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, results->status()); - auto *dLdp = results->at(0); - + auto *dLdp = results->at(0); + ASSERT_TRUE(dLdpExp.isSameShape(dLdp)); ASSERT_TRUE(dLdpExp.equalsTo(dLdp)); @@ -2490,7 +2490,7 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test4) { NDArray labels('c', {2,3}, {0,1,1, 3,3,2}, nd4j::DataType::INT64); NDArray logits('c', {2,3,4}, nd4j::DataType::DOUBLE); - + NDArray dLdpExp('c', {2,3,4}, {-0.78616, 0.23633, 0.26118, 0.28865, 0.21384, -0.76367, 0.26118, 0.28865, 0.21384, -0.76367, 0.26118, 0.28865, 0.21384, 0.23633, 0.26118, -0.71135, 0.21384, 0.23633, 0.26118, -0.71135, 0.21384, 0.23633, -0.73882, 0.28865}); logits.linspace(-0.5, 0.1); @@ -2498,11 +2498,11 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test4) { nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op; auto results = op.execute({&labels, &logits}, {}, {}); - + ASSERT_EQ(ND4J_STATUS_OK, 
 results->status());

-    auto *dLdp = results->at(0);
-
+    auto *dLdp = results->at(0);
+
     ASSERT_TRUE(dLdpExp.isSameShape(dLdp));
     ASSERT_TRUE(dLdpExp.equalsTo(dLdp));

@@ -2514,17 +2514,17 @@ TEST_F(DeclarableOpsTests11, sparseSoftmaxCrossEntropyWithLogits_grad_test5) {

     NDArray labels('c', {1,1}, {0}, nd4j::DataType::INT64);
     NDArray logits('c', {1,1,2}, {-0.3,0.2});
-
-    NDArray dLdpExp('c', {1,1,2}, {-0.62246, 0.62246});
+
+    NDArray dLdpExp('c', {1,1,2}, {-0.62246, 0.62246});

     nd4j::ops::sparse_softmax_cross_entropy_loss_with_logits_grad op;

     auto results = op.execute({&labels, &logits}, {}, {});
-
+
     ASSERT_EQ(ND4J_STATUS_OK, results->status());

-    auto *dLdp = results->at(0);
-
+    auto *dLdp = results->at(0);
+
     ASSERT_TRUE(dLdpExp.isSameShape(dLdp));
     ASSERT_TRUE(dLdpExp.equalsTo(dLdp));

diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp
index fd32f8a79..da5f5f75d 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests4.cpp
@@ -245,7 +245,7 @@ TYPED_TEST(TypedDeclarableOpsTests4, Test_Pooling_Parity_12) {
 TEST_F(DeclarableOpsTests4, Test_BiasAdd_NHWC_1) {
     auto x = NDArrayFactory::create('c', {2, 3, 3, 2});
-    auto bias = NDArrayFactory::create('c', {1, 2}, {1, 2});
+    auto bias = NDArrayFactory::create('c', {2}, {1, 2});
     auto exp = NDArrayFactory::create('c', {2, 3, 3, 2}, {1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f});

     nd4j::ops::biasadd op;
@@ -263,11 +263,11 @@ TEST_F(DeclarableOpsTests4, Test_BiasAdd_NHWC_1) {
 TEST_F(DeclarableOpsTests4, Test_BiasAdd_NCHW_1) {
     auto x = NDArrayFactory::create('c', {2, 2, 3, 3});
-    auto bias = NDArrayFactory::create('c', {1, 2}, {1, 2});
-    auto exp = NDArrayFactory::create('c', {2, 2, 3, 3}, {1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f, 1.f, 2.f});
+    auto bias = NDArrayFactory::create('c', {2}, {1, 2});
+    auto exp = NDArrayFactory::create('c', {2, 2, 3, 3}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2});

     nd4j::ops::biasadd op;
-    auto result = op.execute({&x, &bias}, {}, {}, {}, false, nd4j::DataType::DOUBLE);
+    auto result = op.execute({&x, &bias}, {}, {}, {true}, false, nd4j::DataType::DOUBLE);

     ASSERT_EQ(ND4J_STATUS_OK, result->status());

@@ -360,6 +360,42 @@ TEST_F(DeclarableOpsTests4, Test_FlattenTests_2) {
     delete result;
 }

+TEST_F(DeclarableOpsTests4, Test_FlattenTests_3) {
+    NDArray x('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT32);
+    NDArray y('f', {2,2}, nd4j::DataType::INT32);
+    NDArray exp('c', {8}, {1, 2, 3, 4, 1, 2, 3, 4}, nd4j::DataType::INT32);
+
+    y.assign(x);
+
+    nd4j::ops::flatten op;
+    auto result = op.execute({&x, &y}, {}, {'c'});
+    ASSERT_EQ(ND4J_STATUS_OK, result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
+TEST_F(DeclarableOpsTests4, Test_FlattenTests_4) {
+    NDArray x('c', {2,2}, {1, 2, 3, 4}, nd4j::DataType::INT32);
+    NDArray y('f', {2,2}, nd4j::DataType::INT32);
+    NDArray exp('c', {8}, {1, 3, 2, 4, 1, 3, 2, 4}, nd4j::DataType::INT32);
+
+    y.assign(x);
+
+    nd4j::ops::flatten op;
+    auto result = op.execute({&x, &y}, {}, {'f'});
+    ASSERT_EQ(ND4J_STATUS_OK, result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_TRUE(exp.equalsTo(z));
+
+    delete result;
+}
+
 TEST_F(DeclarableOpsTests4, Test_FloorTests_1) {
     auto x = NDArrayFactory::create('c', {3, 3}, {1.5, 2.3, 3.4, 4.3, 5.9, 6.1, 7.2, 8.9, 9.7});
     auto exp = NDArrayFactory::create('c', {3,3});
@@ -608,7 +645,7 @@ TEST_F(DeclarableOpsTests4, Test_BiasAdd_1) {
     auto exp = NDArrayFactory::create('c', {2, 3}, {1, 2, 3, 1, 2, 3});

     nd4j::ops::biasadd op;
-    auto result = op.execute({&x, &row}, {}, {}, {}, false, nd4j::DataType::DOUBLE);
+    auto result = op.execute({&x, &row}, {}, {}, {true}, false, nd4j::DataType::DOUBLE);

     ASSERT_EQ(ND4J_STATUS_OK, result->status());

diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
index a5e808867..e6c692f5b 100644
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests6.cpp
@@ -1610,21 +1610,8 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_1) {
 ////////////////////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests6, MatrixInverse_010) {

-    auto x = NDArrayFactory::create('c', {1, 5, 5}, {
-        1., 0., 0., 0., 0.,
-        2., 1., 0., 0., 0.,
-        30., 2., 1., 0., 0.,
-        4., 3., 2., 1., 0.,
-        5., 4., 3., 2., 1.,
-    });
-
-    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {
-        1.0, 0.0, 0.0, 0.0, 0.,
-        -2.0, 1.0, 0., 0., 0.,
-        -26.0, -2.0, 1, 0, 0.,
-        54.0, 1.0, -2.0, 1, 0.,
-        -27.0, 0.0, 1.0, -2.0, 1.
-    });
+    auto x = NDArrayFactory::create('c', {1, 5, 5}, {1., 0., 0., 0., 0.,2., 1., 0., 0., 0.,30., 2., 1., 0., 0.,4., 3., 2., 1., 0.,5., 4., 3., 2., 1.,});
+    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {1.0, 0.0, 0.0, 0.0, 0.,-2.0, 1.0, 0., 0., 0.,-26.0, -2.0, 1, 0, 0.,54.0, 1.0, -2.0, 1, 0.,-27.0, 0.0, 1.0, -2.0, 1.});

     nd4j::ops::matrix_inverse op;
     auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32);
@@ -1632,8 +1619,6 @@
     ASSERT_EQ(ND4J_STATUS_OK, result->status());

     auto z = result->at(0);
-//    z->printIndexedBuffer("010 Output ");
-//    exp.printIndexedBuffer("010 Expected ");

     ASSERT_TRUE(exp.isSameShape(z));
     ASSERT_TRUE(exp.equalsTo(z));
@@ -1644,24 +1629,9 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_01) {

-    auto x = NDArrayFactory::create('c', {1, 5, 5}, {
-        2., 4., 60., 8., 10.,
-        0., 1., 2., 3., 4.,
-        0., 0., 2., 4., 6.,
-        0., 0., 0., 1., 2.,
-        0., 0., 0., 0., 4.
-
-    });
-
-    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {
-        0.5, -2.0, -13.0, 54.0, -6.75,
-        0.0, 1.0, -1.0, 1.0, 0.0,
-        0, 0, 0.5, -2.0, 0.25,
-        0, 0, 0, 1.0, -0.5,
-        0, 0, 0, 0, 0.25
-
-    });
+    auto x = NDArrayFactory::create('c', {1, 5, 5}, {2., 4., 60., 8., 10., 0., 1., 2., 3., 4., 0., 0., 2., 4., 6., 0., 0., 0., 1., 2., 0., 0., 0., 0., 4. });
+    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {0.5, -2.0, -13.0, 54.0, -6.75, 0.0, 1.0, -1.0, 1.0, 0.0, 0, 0, 0.5, -2.0, 0.25, 0, 0, 0, 1.0, -0.5, 0, 0, 0, 0, 0.25 });

     nd4j::ops::matrix_inverse op;
     auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32);
@@ -1680,21 +1650,8 @@ TEST_F(DeclarableOpsTests6, MatrixInverse_02) {

-    auto x = NDArrayFactory::create('c', {1, 5, 5}, {
-        1., 0., 0., 0., 0.,
-        2., 1., 0., 0., 0.,
-        30., 2., 1., 0., 0.,
-        4., 3., 2., 1., 0.,
-        5., 4., 3., 2., 1.
-    });
-
-    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {
-        1.0, 0.0, 0.0, 0.0, 0.,
-        -2.0, 1.0, 0., 0., 0.,
-        -26.0, -2.0, 1, 0, 0.,
-        54.0, 1.0, -2.0, 1, 0.,
-        -27.0, 0.0, 1.0, -2.0, 1.
-    });
+    auto x = NDArrayFactory::create('c', {1, 5, 5}, {1., 0., 0., 0., 0., 2., 1., 0., 0., 0., 30., 2., 1., 0., 0., 4., 3., 2., 1., 0., 5., 4., 3., 2., 1. });
+    auto exp = NDArrayFactory::create('c', {1, 5, 5}, {1.0, 0.0, 0.0, 0.0, 0., -2.0, 1.0, 0., 0., 0., -26.0, -2.0, 1, 0, 0., 54.0, 1.0, -2.0, 1, 0., -27.0, 0.0, 1.0, -2.0, 1. });

     nd4j::ops::matrix_inverse op;
     auto result = op.execute({&x}, {}, {}, {}, false, nd4j::DataType::FLOAT32);

diff --git a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
index d0d67000b..21af8e380 100644
--- a/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/JavaInteropTests.cpp
@@ -169,7 +169,7 @@ TEST_F(JavaInteropTests, TestSconv2d_1) {
     auto input = NDArrayFactory::create('c', {3, 3, 8, 8});
     auto weightsD = NDArrayFactory::create('c', {1, 3, 1, 1});
     auto weightsP = NDArrayFactory::create('c', {2, 3, 1, 1});
-    auto bias = NDArrayFactory::create('c', {1, 2});
+    auto bias = NDArrayFactory::create('c', {2});
     auto output = NDArrayFactory::create('c', {3, 2, 8, 8});
     output.assign(0.0);

diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp
index 32ff23847..9f9937368 100644
--- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp
@@ -1259,7 +1259,7 @@ TEST_F(NDArrayTest2, reduce_1) {
         for (int x = 0; x < 4; x++) {
             for (int y = 0; y < 4; y++) {
                 Nd4jLong indices[] = {0, 0, x, y, i, j};
-                Nd4jLong offset = shape::getOffset(0, arr6.shapeOf(), arr6.stridesOf(), indices, arr6.rankOf());
+                Nd4jLong offset = shape::getOffset(arr6.getShapeInfo(), indices);
                 sum += ((double*)arr6.getBuffer())[offset];
             }
         }

diff --git a/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp b/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp
index 5005808bf..05f823e4a 100644
--- a/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/OneOffTests.cpp
@@ -159,8 +159,6 @@ TEST_F(OneOffTests, test_conv2d_nhwc_failed_1) {
     auto z = graph->getVariableSpace()->getVariable(9)->getNDArray();
     ASSERT_TRUE(z != nullptr);

-    // z->printIndexedBuffer("z");
-
     ASSERT_EQ(e, *z);

     delete graph;

diff --git a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
index b6981a5c3..0254d1877 100644
--- a/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ParityOpsTests.cpp
@@ -683,7 +683,7 @@ TEST_F(ParityOpsTests, Test_Reshape_TF_1) {
 TEST_F(ParityOpsTests, Test_Bias_Add_1) {
     auto x = NDArrayFactory::create('c', {10, 5});
     x.assign(0.0);
-    auto bias = NDArrayFactory::create('c', {1, 5}, {1, 2, 3, 4, 5});
+    auto bias = NDArrayFactory::create('c', {5}, {1, 2, 3, 4, 5});

     nd4j::ops::biasadd op;
     auto result = op.execute({&x, &bias}, {}, {});

     auto z = result->at(0);
-
     auto tads = z->allTensorsAlongDimension({1});
     for (int e = 0; e < tads->size(); e++) {
         ASSERT_TRUE(bias.equalsTo(tads->at(e)));

diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp
index 98b9cd026..071c33fab 100644
--- a/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ShapeTests.cpp
@@ -67,7 +67,7 @@ TEST_F(ShapeTests, Test_ShapeEquality_1) {
     Nd4jLong shape[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, -1, 102};
     Nd4jLong shape_GOOD[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, 1, 99};
     Nd4jLong shape_BAD[] = {4, 3, 3, 4, 5, 60, 20, 5, 1, 0, -1, 102};
-
+
     ASSERT_TRUE(shape::equalsSoft(shape, shape_GOOD));
     ASSERT_FALSE(shape::equalsSoft(shape, shape_BAD));

@@ -77,7 +77,7 @@ TEST_F(ShapeTests, Test_ShapeEquality_2) {
     Nd4jLong shape[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, -1, 102};
     Nd4jLong shape_GOOD[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, -1, 102};
     Nd4jLong shape_BAD[] = {4, 2, 3, 4, 5, 60, 20, 5, 1, 0, -1, 99};
-
+
     ASSERT_TRUE(shape::equalsStrict(shape, shape_GOOD));
     ASSERT_FALSE(shape::equalsStrict(shape, shape_BAD));

@@ -86,45 +86,24 @@ TEST_F(ShapeTests, Test_Ind2SubC_1) {
     Nd4jLong shape[] = {3, 5};
     Nd4jLong c0[2];
-    shape::index2coords(2, shape, 0, c0);
+    shape::index2coords(0, 2, shape, c0);

     ASSERT_EQ(0, c0[0]);
     ASSERT_EQ(0, c0[1]);

     Nd4jLong c1[2];
-    shape::index2coords(2, shape, 1, c1);
+    shape::index2coords(1, 2, shape, c1);

     ASSERT_EQ(0, c1[0]);
     ASSERT_EQ(1, c1[1]);

     Nd4jLong c6[2];
-    shape::index2coords(2, shape, 5, c6);
+    shape::index2coords(5, 2, shape, c6);

     ASSERT_EQ(1, c6[0]);
-    ASSERT_EQ(0, c6[1]);
+    ASSERT_EQ(0, c6[1]);
 }

-TEST_F(ShapeTests, Test_Ind2Sub_1) {
-    Nd4jLong shape[] = {3, 5};
-
-    Nd4jLong c0[2];
-    shape::index2coords(2, shape, 0, c0, 'f');
-
-    ASSERT_EQ(0, c0[0]);
-    ASSERT_EQ(0, c0[1]);
-
-    Nd4jLong c1[2];
-    shape::index2coords(2, shape, 1, c1, 'f');
-
-    ASSERT_EQ(1, c1[0]);
-    ASSERT_EQ(0, c1[1]);
-
-    Nd4jLong c6[2];
-    shape::index2coords(2, shape, 5, c6, 'f');
-
-    ASSERT_EQ(2, c6[0]);
-    ASSERT_EQ(1, c6[1]);
-}

 TEST_F(ShapeTests, Test_ShapeDetector_1) {
     Nd4jLong shape[] = {2, 5, 3, 3, 1, 0, 1, 99};
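// The ShapeTests hunks above pin down the two shape-helper signatures this
// patch migrates to: shape::index2coords now takes the linear index first,
// index2coords(index, rank, shape, coords) instead of the old
// index2coords(rank, shape, index, coords), and shape::getOffset now takes
// the packed shape-info buffer plus coordinates. A round-trip sketch under
// the new signatures (the getOffset call mirrors the NDArrayTests2 hunk
// above; arr stands for any NDArray with this shape):
//
//   Nd4jLong shape[] = {3, 5};                  // rank-2, as in Test_Ind2SubC_1
//   Nd4jLong coords[2];
//   shape::index2coords(5, 2, shape, coords);   // linear index 5 -> {1, 0}
//   // Nd4jLong offset = shape::getOffset(arr.getShapeInfo(), coords);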