diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 6c5f6a8c8..f7c6d0684 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -4866,7 +4866,7 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni } } - Nd4jLong *shapeInfoNoUnities = newShapeInfo; + Nd4jLong *newShapeInfo2 = newShapeInfo; if(!keepUnitiesInShape) { @@ -4877,18 +4877,18 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni dimsWithUnities.push_back(d); if(!dimsWithUnities.empty()) - shapeInfoNoUnities = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace()); + newShapeInfo2 = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace()); } // check if there is possibility to set ews = 1 - shape::checkStridesSetEwsAndOrder(shapeInfoNoUnities); + shape::checkStridesEwsAndOrder(newShapeInfo2); - NDArray result(_buffer, ShapeDescriptor(shapeInfoNoUnities), getContext(), offset + getBufferOffset()); + NDArray result(_buffer, ShapeDescriptor(newShapeInfo2), getContext(), offset + getBufferOffset()); result._isView = true; RELEASE(newShapeInfo, getContext()->getWorkspace()); - if(newShapeInfo != shapeInfoNoUnities) - RELEASE(shapeInfoNoUnities, getContext()->getWorkspace()); + if(newShapeInfo != newShapeInfo2) + RELEASE(newShapeInfo2, getContext()->getWorkspace()); return result; } diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index 3d1d96f4b..d4e95c65f 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -900,9 +900,9 @@ namespace shape { * @return the double at the specified index */ - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0); - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *indices, Nd4jLong baseOffset = 0); - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *indices, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0); ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank); @@ -1014,8 +1014,8 @@ namespace shape { // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities); - ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo); + ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities); + ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo); /** * processes whole set of sub-arrays @@ -1041,7 +1041,7 @@ namespace shape { ND4J_EXPORT _CUDA_HD int 
excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities); /** - * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2 * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} */ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo); @@ -2071,7 +2071,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn shapeInfo[i + 1 + rank] = temp[rearrange[i] + 1 + rank]; } - shape::checkStridesSetEwsAndOrder(shapeInfo); + shape::checkStridesEwsAndOrder(shapeInfo); delete[] temp; } @@ -2483,7 +2483,7 @@ INLINEDEF _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape) newShapeBuffer[2 * newRank + 3] = shape::order(shapeBuffer); // correct order and ews if necessary - shape::checkStridesSetEwsAndOrder(newShapeBuffer); + shape::checkStridesEwsAndOrder(newShapeBuffer); delete[] indices; @@ -4092,7 +4092,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap // set ews if(oldEws == 0) - shape::checkStridesSetEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order + shape::checkStridesEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order else { newShapeInfo[2 * newRank + 3] = oldOrder; // order *shape::ews(newShapeInfo) = oldEws; // ews @@ -4642,7 +4642,7 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) { +INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo) { // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities) Nd4jLong tempBuffer[2*MAX_RANK]; @@ -4651,11 +4651,11 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) { // exclude unities from shapeInfo const int numOfNonUnities = shape::excludeUnitiesFromShapeInfo(shapeInfo, shape, strides); - shape::checkStridesSetEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides); + shape::checkStridesEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides); } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) { +INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) { const int rank = shape::rank(shapeInfo); @@ -4673,19 +4673,32 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch bool contiguous = true; - // *** check whether strides are in c contiguous order ***// - if(stridesNoUnities[numOfNonUnities - 1] != 1) // last stride should be always unity for c order - contiguous = false; - else { - for (uint i = 0; i < numOfNonUnities - 1; ++i) { - if(stridesNoUnities[i] != stridesNoUnities[i + 1] * shapeNoUnities[i + 1]) { - contiguous = false; - break; - } + //*** 
check whether strides are in c contiguous order ***// + for (uint i = 0; i < numOfNonUnities - 1; ++i) { + if(stridesNoUnities[i] != shapeNoUnities[i + 1] * stridesNoUnities[i + 1]) { + contiguous = false; + break; } } + if(contiguous) { - *shape::ews(shapeInfo) = 1; + + // for example we have shapeInfo = {3, 5,1,1, 4,4,1, ...} then we should change it to shapeInfo = {3, 5,1,1, 4,4,4, ...ews=4} + if(numOfNonUnities < rank) { // unities are present in shape + + int indNonUnit = rank - 1; + + while(shape::shapeOf(shapeInfo)[indNonUnit--] == 1) + + for(int j = indNonUnit + 2; j < rank; ++j) + shape::stride(shapeInfo)[j] = stridesNoUnities[numOfNonUnities - 1]; + + for(int j = indNonUnit; j >= 0; --j) + if(shape::shapeOf(shapeInfo)[j] == 1) + shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j + 1] * shape::stride(shapeInfo)[j + 1]; + } + + *shape::ews(shapeInfo) = stridesNoUnities[numOfNonUnities - 1]; shapeInfo[rank * 2 + 3] = 99; return; } @@ -4693,18 +4706,31 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch contiguous = true; //*** check whether strides are in f contiguous order ***// - if(stridesNoUnities[0] != 1) // first stride should be always unity for f order - contiguous = false; - else { - for (uint i = 1; i < numOfNonUnities; ++i) { - if(stridesNoUnities[i] != stridesNoUnities[i - 1] * shapeNoUnities[i - 1]) { - contiguous = false; - break; - } + for (uint i = 1; i < numOfNonUnities; ++i) { + if(stridesNoUnities[i] != shapeNoUnities[i - 1] * stridesNoUnities[i - 1]) { + contiguous = false; + break; } } + if(contiguous) { - *shape::ews(shapeInfo) = 1; + + // for example we have shapeInfo = {3, 1,1,5, 1,4,4, ...} then we should change it to shapeInfo = {3, 1,1,5, 4,4,4, ...ews=4} + if(numOfNonUnities < rank) { // unities are present in shape + + int indNonUnit = 0; + + while(shape::shapeOf(shapeInfo)[indNonUnit++] == 1) + + for(int j = 0; j < indNonUnit - 1; ++j) + shape::stride(shapeInfo)[j] = stridesNoUnities[0]; + + for(int j = indNonUnit; j < rank; ++j) + if(shape::shapeOf(shapeInfo)[j] == 1) + shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j - 1] * shape::stride(shapeInfo)[j - 1]; + } + + *shape::ews(shapeInfo) = stridesNoUnities[0]; shapeInfo[rank * 2 + 3] = 102; return; } @@ -4756,7 +4782,7 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo shape::calcOffsets(dimsSize, shape, strides, subArrOffsets); // evaluate ews - shape::checkStridesSetEwsAndOrder(subArrShapeInfo); + shape::checkStridesEwsAndOrder(subArrShapeInfo); delete []strides; delete []shape; diff --git a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp index 2003eef3f..faa59fa6c 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp @@ -42,8 +42,8 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { std::vector arrsToDelete; int index = 0; bool allOfSameType = true; - auto theFirstRank = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0; - auto theFirstDatatype = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType(); + auto rankOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0; + auto typeOfFirstArr = block.width() > 0 ? 
INPUT_VARIABLE(0)->dataType() : block.dataType(); for(int i = 0; i < numOfInArrs; ++i) { auto input = INPUT_VARIABLE(i); @@ -51,10 +51,10 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { // TODO: follow two lines are in accordance to current tf.concat spec. Commented for compatibility with legacy // REQUIRE_TRUE(currentRank > 0, 0, "Rank of input variable %i must be greater 0, but is %lld instead.", i, currentRank); -// REQUIRE_TRUE(theFirstRank == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, theFirstRank); +// REQUIRE_TRUE(rankOfFirstArr == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, rankOfFirstArr); if(!input->isEmpty()) { - allOfSameType &= (theFirstDatatype == input->dataType()); + allOfSameType &= (typeOfFirstArr == input->dataType()); if(input->rankOf() == 0) { auto vec = new NDArray('c', {1}, input->dataType(), block.launchContext()); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu index 43c0e4af9..b455ff659 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu @@ -85,38 +85,106 @@ BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int blocksPerGrid ////////////////////////////////////////////////////////////////////////// void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { - const int threadsPerBlock = 256; - const int blocksPerGrid = 512; - const int sharedMem = 512; + const int numOfInArrs = inArrs.size(); + const auto sizeofT = output.sizeOfT(); - const int numOfArrs = inArrs.size(); - - for(int i = 0; i < numOfArrs; ++i) + for(int i = 0; i < numOfInArrs; ++i) inArrs[i]->syncToDevice(); - output.syncToDevice(); - // prepare arrays of pointers on buffers and shapes - std::vector hInBuffers(numOfArrs); - std::vector hInShapeInfo(numOfArrs); + bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1; - for(int i = 0; i < numOfArrs; ++i) { - hInBuffers[i] = inArrs[i]->getSpecialBuffer(); - hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo(); + if(luckCase1) { + for (uint i = 0; i < numOfInArrs; ++i) { + luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1; + if(!luckCase1) + break; + } } - PointersManager manager(context, "helpers::concat"); + if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f - void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*)); - void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*)); + void* z = static_cast(output.getSpecialBuffer()); - BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES); + for (uint i = 0; i < numOfInArrs; ++i) { + const auto memAmountToCopy = inArrs[i]->lengthOf() * sizeofT; + cudaMemcpyAsync(z, static_cast(inArrs[i]->getSpecialBuffer()), memAmountToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream()); + z = static_cast(z) + memAmountToCopy; + } - manager.synchronize(); + 
if(cudaStreamSynchronize(*context->getCudaStream()) != 0) + throw std::runtime_error("concat cuda: luckCase1 failed!"); - for(int i = 0; i < numOfArrs; ++i) + for(int i = 0; i < numOfInArrs; ++i) + inArrs[i]->tickReadDevice(); + output.tickWriteDevice(); + + return; + } + + const bool isZcontin = output.strideAt(axis) == 1; + bool areInputsContin = true; + bool allSameOrder = true; + + if(isZcontin) { + for (uint i = 0; i < inArrs.size(); ++i) { + areInputsContin &= inArrs[i]->strideAt(axis) == 1; + allSameOrder &= output.ordering() == inArrs[i]->ordering(); + if(!areInputsContin || !allSameOrder) + break; + } + } + + const bool luckCase2 = isZcontin && areInputsContin && allSameOrder; + + if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array + + const uint zDim = output.sizeAt(axis); + + for (uint i = 0; i < output.lengthOf() / zDim; ++i) { + + const auto iShift = i * sizeofT; + void* z = static_cast(output.getSpecialBuffer()) + zDim * iShift; + + for (uint j = 0; j < numOfInArrs; ++j) { + const auto xDim = inArrs[j]->sizeAt(axis); + void* x = static_cast(inArrs[j]->getSpecialBuffer()) + xDim * iShift; + const auto memSizeToCopy = xDim * sizeofT; + cudaMemcpyAsync(z, x, memSizeToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream()); + z = static_cast(z) + memSizeToCopy; + } + } + + if(cudaStreamSynchronize(*context->getCudaStream()) != 0) + throw std::runtime_error("concat cuda: luckCase2 failed!"); + } + else { // general (slower) case + + const int threadsPerBlock = 256; + const int blocksPerGrid = 512; + const int sharedMem = 512; + + // prepare arrays of pointers on buffers and shapes + std::vector hInBuffers(numOfInArrs); + std::vector hInShapeInfo(numOfInArrs); + + for(int i = 0; i < numOfInArrs; ++i) { + hInBuffers[i] = inArrs[i]->getSpecialBuffer(); + hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo(); + } + + PointersManager manager(context, "helpers::concat"); + + void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*)); + void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*)); + + BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES); + + manager.synchronize(); + } + + for(int i = 0; i < numOfInArrs; ++i) inArrs[i]->tickReadDevice(); - output.tickWriteDevice(); } diff --git a/libnd4j/include/ops/impl/specials_single.hpp b/libnd4j/include/ops/impl/specials_single.hpp index 030e9c6d7..ad63ee490 100644 --- a/libnd4j/include/ops/impl/specials_single.hpp +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -31,81 +31,170 @@ #include namespace nd4j { - /** * Concatneate multi array of the same shape together * along a particular dimension */ +// template +// void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { +// const uint numOfArrs = inArrs.size(); + +// int outDim; +// const bool isOutputVector = output.isCommonVector(outDim); + +// if(isOutputVector || (axis == 0 && output.ordering() == 'c')) { + +// bool allVectorsOrScalars = true; +// const uint outEws = isOutputVector ? 
output.stridesOf()[outDim] : output.ews(); + +// std::vector nonUnityDim(numOfArrs); +// std::vector zOffset(numOfArrs); + +// for(int i = 0; i < numOfArrs; i++) { +// allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i])); +// if(!allVectorsOrScalars) +// break; +// if(i == 0) zOffset[0] = 0; +// else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf(); +// } + +// if(allVectorsOrScalars) { + +// T* outBuff = output.bufferAsT(); + +// auto func = PRAGMA_THREADS_FOR { +// for (auto r = start; r < stop; r += increment) { +// const Nd4jLong arrLen = inArrs[r]->lengthOf(); +// const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + +// T *z = outBuff + zOffset[r]; +// T *x = inArrs[r]->bufferAsT(); + +// if (outEws == 1 && xEws == 1) +// for (Nd4jLong e = 0; e < arrLen; e++) +// z[e] = x[e]; +// else +// for (Nd4jLong e = 0; e < arrLen; e++) +// z[e * outEws] = x[e * xEws]; +// } +// }; + +// samediff::Threads::parallel_tad(func, 0, numOfArrs); +// return; +// } +// } + +// const int rank = inArrs[0]->rankOf(); +// const int rank2 = 2*rank; +// std::vector> indices(numOfArrs, std::vector(rank2,0)); + +// // take into account indices for first array +// indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis); + +// // loop through the rest of input arrays +// for(int i = 1; i < numOfArrs; ++i) { +// indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from +// indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) +// } + +// auto func = PRAGMA_THREADS_FOR { +// for (auto i = start; i < stop; i += increment) { +// auto temp = output(indices[i], true); +// nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); +// } +// }; + +// samediff::Threads::parallel_tad(func, 0, numOfArrs); +// } + template void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { - const uint numOfArrs = inArrs.size(); - int outDim; - const bool isOutputVector = output.isCommonVector(outDim); + const int numOfInArrs = inArrs.size(); + const auto sizeofT = output.sizeOfT(); - if(isOutputVector || (axis == 0 && output.ordering() == 'c')) { + T* zBuff = output.bufferAsT(); - bool allVectorsOrScalars = true; - const uint outEws = isOutputVector ? 
output.stridesOf()[outDim] : output.ews(); + bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1; - std::vector nonUnityDim(numOfArrs); - std::vector zOffset(numOfArrs); + if(luckCase1) { + for (uint i = 0; i < numOfInArrs; ++i) { + luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1; + if(!luckCase1) + break; + } + } - for(int i = 0; i < numOfArrs; i++) { - allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i])); - if(!allVectorsOrScalars) - break; - if(i == 0) zOffset[0] = 0; - else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf(); - } + if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f - if(allVectorsOrScalars) { + T* z = zBuff; + for (uint i = 0; i < numOfInArrs; ++i) { + const auto memAmountToCopy = inArrs[i]->lengthOf(); + memcpy(z, inArrs[i]->bufferAsT(), memAmountToCopy * sizeofT); + z += memAmountToCopy; + } + return; + } - T* outBuff = output.bufferAsT(); + const bool isZcontin = output.strideAt(axis) == 1 && output.ordering() == 'c'; + bool areInputsContin = true; + bool allSameOrder = true; - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r++) { - const Nd4jLong arrLen = inArrs[r]->lengthOf(); - const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + if(isZcontin) { + for (uint i = 0; i < numOfInArrs; ++i) { + areInputsContin &= inArrs[i]->strideAt(axis) == 1; + allSameOrder &= inArrs[i]->ordering() == output.ordering(); + if(!areInputsContin || !allSameOrder) + break; + } + } - T *z = outBuff + zOffset[r]; - T *x = inArrs[r]->bufferAsT(); + const bool luckCase2 = isZcontin && areInputsContin && allSameOrder; - if (outEws == 1 && xEws == 1) - for (Nd4jLong e = 0; e < arrLen; e++) - z[e] = x[e]; - else - for (Nd4jLong e = 0; e < arrLen; e++) - z[e * outEws] = x[e * xEws]; - } - }; + if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array - samediff::Threads::parallel_tad(func, 0, numOfArrs); - return; + const uint zDim = output.sizeAt(axis); + + for (uint i = 0; i < output.lengthOf() / zDim; ++i) { + T* z = zBuff + zDim * i; + + for (uint j = 0; j < inArrs.size(); ++j) { + const auto xDim = inArrs[j]->sizeAt(axis); + const T* x = inArrs[j]->bufferAsT() + xDim * i; + memcpy(z, x, xDim * sizeofT); + z += xDim; } } - const int rank = inArrs[0]->rankOf(); - const int rank2 = 2*rank; - std::vector> indices(numOfArrs, std::vector(rank2,0)); + return; + } - // take into account indices for first array - indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis); + // general case + auto func = PRAGMA_THREADS_FOR { - // loop through the rest of input arrays - for(int i = 1; i < numOfArrs; ++i) { - indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from - indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) - } + Nd4jLong coords[MAX_RANK]; - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i++) { - auto temp = output(indices[i], true); - nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); + for (auto i = start; i < stop; i += increment) { + + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = 
shape::getOffset(output.getShapeInfo(), coords); + + uint inArrIdx = 0; + uint xDim = inArrs[inArrIdx]->sizeAt(axis); + + while (coords[axis] >= xDim) { + coords[axis] -= xDim; + xDim = inArrs[++inArrIdx]->sizeAt(axis); } - }; - samediff::Threads::parallel_tad(func, 0, numOfArrs); + const T* x = inArrs[inArrIdx]->bufferAsT(); + const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords); + + zBuff[zOffset] = x[xOffset]; + } + }; + + samediff::Threads::parallel_for(func, 0, output.lengthOf()); } /** @@ -128,6 +217,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint delete inputs[i]; } + /** * This kernel accumulates X arrays, and stores result into Z * diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 77634b052..773e1dc18 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -300,6 +300,8 @@ TEST_F(DeclarableOpsTests9, concat_test3) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); + output->printBuffer(); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -620,12 +622,12 @@ TEST_F(DeclarableOpsTests9, concat_test18) { // we crate bunch of arrays, filled with specific values for (int e = 0; e < 2000; e++) { - auto array = NDArrayFactory::create_('c', {1, 300}); + auto array = NDArrayFactory::create_('c', {1, 300}); array->assign(e); context.setInputArray(e, array, true); } - auto z = NDArrayFactory::create('c', {2000, 300}); + auto z = NDArrayFactory::create('c', {2000, 300}); context.setOutputArray(0, &z, false); context.setIArguments(&axis, 1); @@ -633,8 +635,10 @@ TEST_F(DeclarableOpsTests9, concat_test18) { op.execute(&context); for (int e = 0; e < 2000; e++) { + auto exp = NDArrayFactory::create('c', {300}); + exp.assign(e); auto row = z.tensorAlongDimension(e, {1}); - ASSERT_NEAR((float) e, row.e(0), 1e-5f); + ASSERT_EQ(exp, row); } } diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp index e3dc1aefc..6d5366396 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp @@ -956,7 +956,7 @@ TEST_F(NDArrayTest2, subarray_1) { float buffExpX3[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000}; Nd4jLong shapeExpX4[] = {3, 2, 1, 4, 12, 4, 1, 8192, 0, 99}; float buffExpX4[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000}; - Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 0, 99}; + Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 4, 99}; float buffExpX5[] = {4.000000, 8.000000, 12.000000, 16.000000, 20.000000, 24.000000}; Nd4jLong shapeExpY0[] = {1, 2, 1, 8192, 1, 102}; diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp index a8f430fe3..fb0d7991a 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp @@ -43,7 +43,7 @@ public: Nd4jLong shape[3] = {3,4,5}; Nd4jLong *shapeBuffer; ThreeDTest() { - shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); } ~ThreeDTest() { delete[] shapeBuffer; @@ -196,11 +196,11 @@ public: int dimensionLength = 2; int dimension[2] = {2,3}; Nd4jLong tadAssertionC[10] = 
{3,4,4,1,4,1,16,16384,1,99}; - Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,1,16384,1,102}; + Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,16,16384,1,102}; }; -TEST_F(LeadingOnes,OnesTest) { +TEST_F(LeadingOnes,OnesTest) { shape::TAD *cTad = new shape::TAD; cTad->init(shapeBufferC,dimension,dimensionLength); @@ -222,7 +222,7 @@ TEST_F(LeadingOnes,OnesTest) { class NormalThreeFourFive : public testing::Test { public: - Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 0, 102}; + Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 5, 99}; Nd4jLong inputShapeBuffer[10] = {3,3,4,5,20,5,1,16384,1,99}; int dimensionLength = 2; int dimension[2] = {0,1}; @@ -243,7 +243,7 @@ class DimensionWarning : public testing::Test { public: int dimensionLength = 2; int dimensions[2] = {0,1}; - Nd4jLong shape[3] = {1,5,1}; + Nd4jLong shape[3] = {1,5,1}; Nd4jLong *shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); ~DimensionWarning() { @@ -324,7 +324,7 @@ public: int dimensionFour = 0; int dimensionLength = 1; FourDTest() { - threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); + threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); fourDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 4, fourDShape); } ~FourDTest() { @@ -491,7 +491,7 @@ TEST_F(LabelTest,LabelTad) { delete tad; } -TEST_F(ExpectedValuesTest,TadTest) { +TEST_F(ExpectedValuesTest,TadTest) { auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, mainShape); shape::TAD *tad = new shape::TAD; tad->init(shapeBuffer,testDimensions,3); @@ -528,7 +528,7 @@ TEST_F(ThreeDTest,TensorAlongDimensionTest) { } -TEST_F(NumTadTests,TadTest) { +TEST_F(NumTadTests,TadTest) { auto shape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, this->shape); shape::TAD *tad = new shape::TAD; tad->init(shape,&dimension,1); @@ -539,7 +539,7 @@ TEST_F(NumTadTests,TadTest) { } TEST_F(TADStall,TestStall) { - auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); shape::TAD *tad = new shape::TAD; tad->init(0,shapeInfo,this->dimensions,3); tad->createTadOnlyShapeInfo(); @@ -564,7 +564,7 @@ TEST_F(PermuteTest,PermuteShapeBufferTest) { Nd4jLong shapeToPermute[4] = {5,3,2,6}; Nd4jLong permutedOrder[4] = {6,2,3,5}; auto shapeBufferOriginal = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); - auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); + auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); shape::permuteShapeBufferInPlace(shapeBufferOriginal,normalOrder,shapeBufferOriginal); EXPECT_TRUE(arrsEquals(4,assertionShapeBuffer,shapeBufferOriginal)); @@ -585,9 +585,9 @@ TEST_F(ElementWiseStrideTest,ElementWiseStrideTest) { TEST_F(SliceVectorTest,RowColumnVectorTest) { Nd4jLong rowVectorShape[2] = {1,5}; - auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); + auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); Nd4jLong colVectorShape[2] = {5,1}; - auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, 
colVectorShape); + auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, colVectorShape); Nd4jLong *sliceRow = shape::sliceOfShapeBuffer(0,rowVectorShapeInfo); EXPECT_TRUE(arrsEquals(2,rowVectorShapeInfo,sliceRow)); Nd4jLong *scalarSliceInfo = shape::createScalarShapeInfo(); @@ -608,7 +608,7 @@ TEST_F(SliceTensorTest,TestSlice) { Nd4jLong shape[3] = {3,3,2}; auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); Nd4jLong sliceShape[2] = {3,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -619,9 +619,9 @@ TEST_F(SliceTensorTest,TestSlice) { TEST_F(SliceMatrixTest,TestSlice) { Nd4jLong shape[2] = {3,2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); Nd4jLong sliceShape[2] = {1,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -664,13 +664,13 @@ TEST_F(TensorTwoFromFourDDimTest,TadTwoFromFourDimTest) { //Along dimension 1,2: expect matrix with shape [cols,dim2] //Along dimension 1,3: expect matrix with shape [cols,dim3] //Along dimension 2,3: expect matrix with shape [dim2,dim3] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; EXPECT_TRUE(arrsEquals(shape::rank(expectedShapeBuffer),expectedShape,shape::shapeOf(testShapeBuffer))); @@ -687,14 +687,14 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { //Along dimension 0,1: expect matrix with shape [rows,cols] //Along dimension 0,2: expect matrix with shape [rows,dim2] //Along dimension 1,2: expect matrix with shape [cols,dim2] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 
dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; Nd4jLong *expectedStride = expectedStrides[i]; @@ -715,7 +715,7 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { TEST_F(TensorOneDimTest,TadDimensionsForTensor) { Nd4jLong shape[3] = {rows,cols,dim2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); for(int i = 0; i < rank; i++) { //Along dimension 0: expect row vector with length 'dims[i]' @@ -737,14 +737,14 @@ TEST_F(TensorOneDimTest,TadDimensionsForTensor) { TEST_F(MatrixTest,TadDimensionsForMatrix) { Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); shape::TAD *dimZero = new shape::TAD; dimZero->init(shapeBuffer,&dims[0],1); shape::TAD *dimOne = new shape::TAD; dimOne->init(shapeBuffer,&dims[1],1); //Along dimension 0: expect row vector with length 'rows' - Nd4jLong rowVectorShape[2] = {1,rows}; + Nd4jLong rowVectorShape[2] = {1,rows}; auto expectedDimZeroShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); dimZero->createTadOnlyShapeInfo(); Nd4jLong *testDimZero = dimZero->tadOnlyShapeInfo; @@ -753,7 +753,7 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { delete[] expectedDimZeroShape; //Along dimension 1: expect row vector with length 'cols' - Nd4jLong rowVectorColShape[2] {1,cols}; + Nd4jLong rowVectorColShape[2] {1,cols}; auto expectedDimOneShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorColShape); dimOne->createTadOnlyShapeInfo(); Nd4jLong *testDimOneShape = dimOne->tadOnlyShapeInfo; @@ -767,12 +767,12 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { } TEST_F(VectorTest,VectorTadShape) { - Nd4jLong rowVector[2] = {2,2}; + Nd4jLong rowVector[2] = {2,2}; auto rowBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVector); int rowDimension = 1; Nd4jLong columnVector[2] = {2,2}; - auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); + auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); int colDimension = 0; @@ -811,7 +811,7 @@ TEST_F(VectorTest,LinspaceCombinationTest) { int len = rows * cols; double *linspaced = linspace(1,rows * cols,len); Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); delete[] shapeBuffer; delete[] linspaced; diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index c8b15c1a2..db2c941e9 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -7742,18 +7742,18 @@ public static final int PREALLOC_SIZE = 33554432; * @return the double at the specified index */ - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer 
shapeInfo, @Cast("const Nd4jLong*") LongPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long 
getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords); @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer createShapeInfo(@Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer stride, int rank); @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer createShapeInfo(@Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer stride, int rank); diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java index 90e9015b1..bad97296f 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java @@ -16,6 +16,7 @@ package org.nd4j.linalg.shape.concat; +import lombok.extern.slf4j.Slf4j; import lombok.val; import org.junit.Ignore; import org.junit.Test; @@ -43,6 +44,7 @@ import static org.junit.Assert.assertTrue; /** * @author Adam Gibson */ +@Slf4j @RunWith(Parameterized.class) public class ConcatTestsC extends BaseNd4jTest { @@ -309,7 +311,11 @@ public class ConcatTestsC extends BaseNd4jTest { for (int e = 0; e < 20000; e++) list.add(Nd4j.create(DataType.INT, 1, 300).assign(e)); + val timeStart = System.nanoTime(); val result = Nd4j.concat(0, list.toArray(new INDArray[list.size()])); + val timeEnd = System.nanoTime(); + + log.info("Time: {} us", (timeEnd - timeStart) / 1000); for (int e = 0; e < 20000; e++) assertEquals((float) e, result.getRow(e).meanNumber().floatValue(), 1e-5f);
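
Reviewer note (not part of the patch): the core of this change replaces the per-element concat helpers with two copy-based fast paths — "luckCase1" (concat along axis 0 for 'c' order, or the last axis for 'f' order, with ews == 1 and matching ordering everywhere), which degenerates into one flat memcpy / cudaMemcpyAsync per input, and "luckCase2" (the concat axis has stride 1 in every array and all orderings match), which copies one contiguous segment per input for every outer slice — plus a coordinate-based general fallback. The sketch below only mimics those two fast paths on plain row-major 2-D buffers; the Matrix type and the concatAxis0/concatAxis1 names are invented for illustration and do not exist in libnd4j, and the sketch omits the ews/ordering/stride checks and the CUDA stream handling the real helpers perform.

    // Simplified, self-contained sketch of the two concat fast paths, using
    // plain row-major ('c'-order) 2-D buffers instead of NDArray.
    #include <cstring>
    #include <iostream>
    #include <vector>

    struct Matrix {                 // dense row-major buffer (illustrative only)
        size_t rows, cols;
        std::vector<float> data;    // rows * cols elements, contiguous
    };

    // "luckCase1" analogue: concat along axis 0 of c-order inputs is one flat
    // copy per input, since the inputs end up laid out back to back in the output.
    Matrix concatAxis0(const std::vector<Matrix>& in) {
        size_t rows = 0, cols = in[0].cols;
        for (const auto& m : in) rows += m.rows;
        Matrix out{rows, cols, std::vector<float>(rows * cols)};
        float* z = out.data.data();
        for (const auto& m : in) {
            std::memcpy(z, m.data.data(), m.data.size() * sizeof(float));
            z += m.data.size();
        }
        return out;
    }

    // "luckCase2" analogue: concat along the last axis; for each output row,
    // copy one contiguous segment from each input (axis stride == 1 everywhere).
    Matrix concatAxis1(const std::vector<Matrix>& in) {
        size_t rows = in[0].rows, cols = 0;
        for (const auto& m : in) cols += m.cols;
        Matrix out{rows, cols, std::vector<float>(rows * cols)};
        for (size_t r = 0; r < rows; ++r) {
            float* z = out.data.data() + r * cols;
            for (const auto& m : in) {               // append this input's row segment
                std::memcpy(z, m.data.data() + r * m.cols, m.cols * sizeof(float));
                z += m.cols;
            }
        }
        return out;
    }

    int main() {
        Matrix a{2, 3, {1, 2, 3, 4, 5, 6}};
        Matrix b{2, 2, {7, 8, 9, 10}};
        Matrix c{1, 3, {11, 12, 13}};

        Matrix v = concatAxis0({a, c});              // {2,3} + {1,3} -> {3,3}
        Matrix h = concatAxis1({a, b});              // {2,3} + {2,2} -> {2,5}

        for (float x : h.data) std::cout << x << ' ';// 1 2 3 7 8 4 5 6 9 10
        std::cout << '\n' << v.rows << 'x' << v.cols << '\n';
    }

The timing added to ConcatTestsC.java is presumably meant to observe exactly this effect: whenever the concat axis has stride 1 in all arrays, concat collapses into a handful of large memcpy / cudaMemcpyAsync calls instead of an element-wise loop (CPU) or kernel launch over output.lengthOf() (CUDA); only irregular layouts fall back to the index2coords/getOffset path.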