profiling of concat op (both cuda and cpu) (#151)

* - profiling of concat op (both cuda and cpu) Signed-off-by: Yurii <iuriish@yahoo.com> * better comparison for large concat Signed-off-by: raver119 <raver119@gmail.com> * - further improving of concat op Signed-off-by: Yurii <iuriish@yahoo.com> * some loggin Signed-off-by: raver119 <raver119@gmail.com> * - add possibility to verify presence of trailing unities in shape and set strides/ews correspondingly - restrict second simple case in concat op to c order only Signed-off-by: Yurii <iuriish@yahoo.com> * - move concat op to specials_single.cpp file Signed-off-by: Yurii <iuriish@yahoo.com> * - get rid of second concat op declaration in transforms.cpp file Signed-off-by: Yurii <iuriish@yahoo.com> Co-authored-by: raver119 <raver119@gmail.com>
2020-02-20 20:19:01 +02:00 · 2020-02-20 20:19:01 +02:00 · f7a9190407
commit f7a9190407
parent 215641ea9e
10 changed files with 348 additions and 154 deletions
--- a/libnd4j/blas/NDArray.hpp
+++ b/libnd4j/blas/NDArray.hpp
@ -4866,7 +4866,7 @@ NDArray NDArray::operator()(const std::vector<Nd4jLong>& idx, const bool keepUni
        }
    }

-    Nd4jLong *shapeInfoNoUnities = newShapeInfo;
+    Nd4jLong *newShapeInfo2 = newShapeInfo;

    if(!keepUnitiesInShape) {

@ -4877,18 +4877,18 @@ NDArray NDArray::operator()(const std::vector<Nd4jLong>& idx, const bool keepUni
                dimsWithUnities.push_back(d);

        if(!dimsWithUnities.empty())
-            shapeInfoNoUnities = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace());
+            newShapeInfo2 = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace());
    }

    // check if there is possibility to set ews = 1
-    shape::checkStridesSetEwsAndOrder(shapeInfoNoUnities);
+    shape::checkStridesEwsAndOrder(newShapeInfo2);

-    NDArray result(_buffer, ShapeDescriptor(shapeInfoNoUnities), getContext(), offset + getBufferOffset());
+    NDArray result(_buffer, ShapeDescriptor(newShapeInfo2), getContext(), offset + getBufferOffset());
    result._isView = true;

    RELEASE(newShapeInfo, getContext()->getWorkspace());
-    if(newShapeInfo != shapeInfoNoUnities)
-        RELEASE(shapeInfoNoUnities, getContext()->getWorkspace());
+    if(newShapeInfo != newShapeInfo2)
+        RELEASE(newShapeInfo2, getContext()->getWorkspace());

    return result;
 }
--- a/libnd4j/include/helpers/shape.h
+++ b/libnd4j/include/helpers/shape.h
@ -900,9 +900,9 @@ namespace shape {
 * @return the double at the specified index
 */

-    ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0);
-    ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *indices, Nd4jLong baseOffset = 0);
-    ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *indices, Nd4jLong baseOffset = 0);
+    ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0);
+    ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0);
+    ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0);

    ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank);

@ -1014,8 +1014,8 @@ namespace shape {
    // if array is scalar or unit length vector then ews = 1 and order is preserved
    // if array is common vector then ews = stride of non-unity dimension and order is preserved
    // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved
-    ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities);
-    ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo);
+    ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities);
+    ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo);

    /**
    * processes whole set of sub-arrays
@ -1041,7 +1041,7 @@ namespace shape {
    ND4J_EXPORT _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities);

    /**
-    * for example inShapeInfo is {3, 2,1,3,1,4,  12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2
+    * for example inShapeInfo is {3, 2,1,3,1,4,  12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2
    * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99}
    */
    INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo);
@ -2071,7 +2071,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn
            shapeInfo[i + 1 + rank] = temp[rearrange[i] + 1 + rank];
        }

-        shape::checkStridesSetEwsAndOrder(shapeInfo);
+        shape::checkStridesEwsAndOrder(shapeInfo);

        delete[] temp;
    }
@ -2483,7 +2483,7 @@ INLINEDEF _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape)
        newShapeBuffer[2 * newRank + 3] = shape::order(shapeBuffer);

        // correct order and ews if necessary
-        shape::checkStridesSetEwsAndOrder(newShapeBuffer);
+        shape::checkStridesEwsAndOrder(newShapeBuffer);

        delete[] indices;

@ -4092,7 +4092,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap

    // set ews
    if(oldEws == 0)
-        shape::checkStridesSetEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides);  // set ews and order
+        shape::checkStridesEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides);  // set ews and order
    else {
        newShapeInfo[2 * newRank + 3] = oldOrder;                   // order
        *shape::ews(newShapeInfo) = oldEws;                         // ews
@ -4642,7 +4642,7 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong
 }

 //////////////////////////////////////////////////////////////////////
-INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) {
+INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo) {

    // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities)
    Nd4jLong tempBuffer[2*MAX_RANK];
@ -4651,11 +4651,11 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) {
    // exclude unities from shapeInfo
    const int numOfNonUnities = shape::excludeUnitiesFromShapeInfo(shapeInfo, shape, strides);

-    shape::checkStridesSetEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides);
+    shape::checkStridesEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides);
 }

 //////////////////////////////////////////////////////////////////////
-INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) {
+INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) {

    const int rank = shape::rank(shapeInfo);

@ -4673,19 +4673,32 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch

    bool contiguous = true;

-    // *** check whether strides are in c contiguous order ***//
-    if(stridesNoUnities[numOfNonUnities - 1] != 1)    // last stride should be always unity for c order
-        contiguous = false;
-    else {
-        for (uint i = 0; i < numOfNonUnities - 1; ++i) {
-            if(stridesNoUnities[i] != stridesNoUnities[i + 1] * shapeNoUnities[i + 1]) {
-                contiguous = false;
-                break;
-            }
+    //*** check whether strides are in c contiguous order ***//
+    for (uint i = 0; i < numOfNonUnities - 1; ++i) {
+        if(stridesNoUnities[i] != shapeNoUnities[i + 1] * stridesNoUnities[i + 1]) {
+            contiguous = false;
+            break;
        }
    }
+
    if(contiguous) {
-        *shape::ews(shapeInfo) = 1;
+
+        // for example we have shapeInfo = {3, 5,1,1, 4,4,1, ...} then we should change it to shapeInfo = {3, 5,1,1, 4,4,4, ...ews=4}
+        if(numOfNonUnities < rank) {    // unities are present in shape
+
+            int indNonUnit = rank - 1;
+
+            while(shape::shapeOf(shapeInfo)[indNonUnit--] == 1)
+
+            for(int j = indNonUnit + 2; j < rank; ++j)
+                shape::stride(shapeInfo)[j] = stridesNoUnities[numOfNonUnities - 1];
+
+            for(int j = indNonUnit; j >= 0; --j)
+                if(shape::shapeOf(shapeInfo)[j] == 1)
+                    shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j + 1] * shape::stride(shapeInfo)[j + 1];
+        }
+
+        *shape::ews(shapeInfo) = stridesNoUnities[numOfNonUnities - 1];
        shapeInfo[rank * 2 + 3] = 99;
        return;
    }
@ -4693,18 +4706,31 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch
    contiguous = true;

    //*** check whether strides are in f contiguous order ***//
-    if(stridesNoUnities[0] != 1)    // first stride should be always unity for f order
-        contiguous = false;
-    else {
-        for (uint i = 1; i < numOfNonUnities; ++i) {
-            if(stridesNoUnities[i] != stridesNoUnities[i - 1] * shapeNoUnities[i - 1]) {
-                contiguous = false;
-                break;
-            }
+    for (uint i = 1; i < numOfNonUnities; ++i) {
+        if(stridesNoUnities[i] != shapeNoUnities[i - 1] * stridesNoUnities[i - 1]) {
+            contiguous = false;
+            break;
        }
    }
+
    if(contiguous) {
-        *shape::ews(shapeInfo) = 1;
+
+        // for example we have shapeInfo = {3, 1,1,5, 1,4,4, ...} then we should change it to shapeInfo = {3, 1,1,5, 4,4,4, ...ews=4}
+        if(numOfNonUnities < rank) {    // unities are present in shape
+
+            int indNonUnit = 0;
+
+            while(shape::shapeOf(shapeInfo)[indNonUnit++] == 1)
+
+            for(int j = 0; j < indNonUnit - 1; ++j)
+                shape::stride(shapeInfo)[j] = stridesNoUnities[0];
+
+            for(int j = indNonUnit; j < rank; ++j)
+                if(shape::shapeOf(shapeInfo)[j] == 1)
+                    shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j - 1] * shape::stride(shapeInfo)[j - 1];
+        }
+
+        *shape::ews(shapeInfo) = stridesNoUnities[0];
        shapeInfo[rank * 2 + 3] = 102;
        return;
    }
@ -4756,7 +4782,7 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo
    shape::calcOffsets(dimsSize, shape, strides, subArrOffsets);

    // evaluate ews
-    shape::checkStridesSetEwsAndOrder(subArrShapeInfo);
+    shape::checkStridesEwsAndOrder(subArrShapeInfo);

    delete []strides;
    delete []shape;
--- a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp
+++ b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp
@ -42,8 +42,8 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) {
    std::vector<int> arrsToDelete;
    int index = 0;
    bool allOfSameType = true;
-    auto theFirstRank = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0;
-    auto theFirstDatatype = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType();
+    auto rankOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0;
+    auto typeOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType();

    for(int i = 0; i < numOfInArrs; ++i) {
        auto input = INPUT_VARIABLE(i);
@ -51,10 +51,10 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) {

 // TODO: follow two lines are in accordance to current tf.concat spec. Commented for compatibility with legacy
 //        REQUIRE_TRUE(currentRank > 0, 0, "Rank of input variable %i must be greater 0, but is %lld instead.", i, currentRank);
-//        REQUIRE_TRUE(theFirstRank == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, theFirstRank);
+//        REQUIRE_TRUE(rankOfFirstArr == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, rankOfFirstArr);
        if(!input->isEmpty()) {

-            allOfSameType &= (theFirstDatatype == input->dataType());
+            allOfSameType &= (typeOfFirstArr == input->dataType());

            if(input->rankOf() == 0) {
                auto vec = new NDArray('c', {1}, input->dataType(), block.launchContext());
--- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
+++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu
@ -85,38 +85,106 @@ BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int blocksPerGrid
 //////////////////////////////////////////////////////////////////////////
 void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {

-    const int threadsPerBlock = 256;
-    const int blocksPerGrid = 512;
-    const int sharedMem = 512;
+    const int numOfInArrs = inArrs.size();
+    const auto sizeofT    = output.sizeOfT();

-    const int numOfArrs = inArrs.size();
-
-    for(int i = 0; i < numOfArrs; ++i)
+    for(int i = 0; i < numOfInArrs; ++i)
        inArrs[i]->syncToDevice();
-
    output.syncToDevice();

-    // prepare arrays of pointers on buffers and shapes
-    std::vector<void*> hInBuffers(numOfArrs);
-    std::vector<Nd4jLong*> hInShapeInfo(numOfArrs);
+    bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1;

-    for(int i = 0; i < numOfArrs; ++i) {
-        hInBuffers[i]   = inArrs[i]->getSpecialBuffer();
-        hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo();
+    if(luckCase1) {
+        for (uint i = 0; i < numOfInArrs; ++i) {
+            luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1;
+            if(!luckCase1)
+                break;
+        }
    }

-    PointersManager manager(context, "helpers::concat");
+    if(luckCase1) {     // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f

-    void* dInBuffers   = manager.replicatePointer(hInBuffers.data(),    hInBuffers.size() * sizeof(void*));
-    void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(),  hInShapeInfo.size() * sizeof(Nd4jLong*));
+        void* z = static_cast<int8_t*>(output.getSpecialBuffer());

-    BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES);
+        for (uint i = 0; i < numOfInArrs; ++i) {
+            const auto memAmountToCopy = inArrs[i]->lengthOf() * sizeofT;
+            cudaMemcpyAsync(z, static_cast<int8_t*>(inArrs[i]->getSpecialBuffer()), memAmountToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream());
+            z = static_cast<int8_t*>(z) + memAmountToCopy;
+        }

-    manager.synchronize();
+        if(cudaStreamSynchronize(*context->getCudaStream()) != 0)
+            throw std::runtime_error("concat cuda: luckCase1 failed!");

-    for(int i = 0; i < numOfArrs; ++i)
+        for(int i = 0; i < numOfInArrs; ++i)
+            inArrs[i]->tickReadDevice();
+        output.tickWriteDevice();
+
+        return;
+    }
+
+    const bool isZcontin = output.strideAt(axis) == 1;
+    bool areInputsContin = true;
+    bool allSameOrder    = true;
+
+    if(isZcontin) {
+        for (uint i = 0; i < inArrs.size(); ++i) {
+            areInputsContin &= inArrs[i]->strideAt(axis) == 1;
+            allSameOrder    &= output.ordering() == inArrs[i]->ordering();
+            if(!areInputsContin || !allSameOrder)
+                break;
+        }
+    }
+
+    const bool luckCase2 = isZcontin && areInputsContin && allSameOrder;
+
+    if(luckCase2) {     // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array
+
+        const uint zDim = output.sizeAt(axis);
+
+        for (uint i = 0; i < output.lengthOf() / zDim; ++i) {
+
+            const auto iShift = i * sizeofT;
+            void* z = static_cast<int8_t*>(output.getSpecialBuffer()) + zDim * iShift;
+
+            for (uint j = 0; j < numOfInArrs; ++j) {
+                const auto xDim = inArrs[j]->sizeAt(axis);
+                void* x = static_cast<int8_t*>(inArrs[j]->getSpecialBuffer()) + xDim * iShift;
+                const auto memSizeToCopy = xDim * sizeofT;
+                cudaMemcpyAsync(z, x, memSizeToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream());
+                z = static_cast<int8_t*>(z) + memSizeToCopy;
+            }
+        }
+
+        if(cudaStreamSynchronize(*context->getCudaStream()) != 0)
+            throw std::runtime_error("concat cuda: luckCase2 failed!");
+    }
+    else {      // general (slower) case
+
+        const int threadsPerBlock = 256;
+        const int blocksPerGrid = 512;
+        const int sharedMem = 512;
+
+        // prepare arrays of pointers on buffers and shapes
+        std::vector<void*> hInBuffers(numOfInArrs);
+        std::vector<Nd4jLong*> hInShapeInfo(numOfInArrs);
+
+        for(int i = 0; i < numOfInArrs; ++i) {
+            hInBuffers[i]   = inArrs[i]->getSpecialBuffer();
+            hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo();
+        }
+
+        PointersManager manager(context, "helpers::concat");
+
+        void* dInBuffers   = manager.replicatePointer(hInBuffers.data(),    hInBuffers.size() * sizeof(void*));
+        void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(),  hInShapeInfo.size() * sizeof(Nd4jLong*));
+
+        BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES);
+
+        manager.synchronize();
+    }
+
+    for(int i = 0; i < numOfInArrs; ++i)
        inArrs[i]->tickReadDevice();
-
    output.tickWriteDevice();
 }

--- a/libnd4j/include/ops/impl/specials_single.hpp
+++ b/libnd4j/include/ops/impl/specials_single.hpp
@ -31,81 +31,170 @@
 #include <helpers/Loops.h>

 namespace nd4j {
-
 /**
 * Concatneate multi array of the same shape together
 * along a particular dimension
 */
+// template <typename T>
+// void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
+//         const uint numOfArrs = inArrs.size();
+
+//         int outDim;
+//         const bool isOutputVector = output.isCommonVector(outDim);
+
+//         if(isOutputVector || (axis == 0 && output.ordering() == 'c')) {
+
+//             bool allVectorsOrScalars = true;
+//             const uint outEws = isOutputVector ? output.stridesOf()[outDim] : output.ews();
+
+//             std::vector<int> nonUnityDim(numOfArrs);
+//             std::vector<Nd4jLong> zOffset(numOfArrs);
+
+//             for(int i = 0; i < numOfArrs; i++) {
+//                 allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i]));
+//                 if(!allVectorsOrScalars)
+//                     break;
+//                 if(i == 0)  zOffset[0] = 0;
+//                 else        zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf();
+//             }
+
+//             if(allVectorsOrScalars) {
+
+//                 T* outBuff = output.bufferAsT<T>();
+
+//                 auto func = PRAGMA_THREADS_FOR {
+//                     for (auto r = start; r < stop; r += increment) {
+//                         const Nd4jLong arrLen = inArrs[r]->lengthOf();
+//                         const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];
+
+//                         T *z = outBuff + zOffset[r];
+//                         T *x = inArrs[r]->bufferAsT<T>();
+
+//                         if (outEws == 1 && xEws == 1)
+//                             for (Nd4jLong e = 0; e < arrLen; e++)
+//                                 z[e] = x[e];
+//                         else
+//                             for (Nd4jLong e = 0; e < arrLen; e++)
+//                                 z[e * outEws] = x[e * xEws];
+//                     }
+//                 };
+
+//                 samediff::Threads::parallel_tad(func, 0, numOfArrs);
+//                 return;
+//             }
+//         }
+
+//         const int rank  = inArrs[0]->rankOf();
+//         const int rank2 = 2*rank;
+//         std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
+
+//         // take into account indices for first array
+//         indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
+
+//         // loop through the rest of input arrays
+//         for(int i = 1; i < numOfArrs; ++i) {
+//             indices[i][2 * axis]     = indices[i-1][2 * axis + 1];                                // index start from
+//             indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis);      // index end with (excluding)
+//         }
+
+//         auto func = PRAGMA_THREADS_FOR {
+//             for (auto i = start; i < stop; i += increment) {
+//                 auto temp = output(indices[i], true);
+//                 nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
+//             }
+//         };
+
+//         samediff::Threads::parallel_tad(func, 0, numOfArrs);
+// }
+
 template <typename T>
 void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
-        const uint numOfArrs = inArrs.size();

-        int outDim;
-        const bool isOutputVector = output.isCommonVector(outDim);
+    const int numOfInArrs = inArrs.size();
+    const auto sizeofT    = output.sizeOfT();

-        if(isOutputVector || (axis == 0 && output.ordering() == 'c')) {
+    T* zBuff = output.bufferAsT<T>();

-            bool allVectorsOrScalars = true;
-            const uint outEws = isOutputVector ? output.stridesOf()[outDim] : output.ews();
+    bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1;

-            std::vector<int> nonUnityDim(numOfArrs);
-            std::vector<Nd4jLong> zOffset(numOfArrs);
+    if(luckCase1) {
+        for (uint i = 0; i < numOfInArrs; ++i) {
+            luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1;
+            if(!luckCase1)
+                break;
+        }
+    }

-            for(int i = 0; i < numOfArrs; i++) {
-                allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i]));
-                if(!allVectorsOrScalars)
-                    break;
-                if(i == 0)  zOffset[0] = 0;
-                else        zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf();
-            }
+    if(luckCase1) {     // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f

-            if(allVectorsOrScalars) {
+        T* z = zBuff;
+        for (uint i = 0; i < numOfInArrs; ++i) {
+            const auto memAmountToCopy = inArrs[i]->lengthOf();
+            memcpy(z, inArrs[i]->bufferAsT<T>(), memAmountToCopy * sizeofT);
+            z += memAmountToCopy;
+        }
+        return;
+    }

-                T* outBuff = output.bufferAsT<T>();
+    const bool isZcontin = output.strideAt(axis) == 1 && output.ordering() == 'c';
+    bool areInputsContin = true;
+    bool allSameOrder    = true;

-                auto func = PRAGMA_THREADS_FOR {
-                    for (auto r = start; r < stop; r++) {
-                        const Nd4jLong arrLen = inArrs[r]->lengthOf();
-                        const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];
+    if(isZcontin) {
+        for (uint i = 0; i < numOfInArrs; ++i) {
+            areInputsContin &= inArrs[i]->strideAt(axis) == 1;
+            allSameOrder    &= inArrs[i]->ordering() == output.ordering();
+            if(!areInputsContin || !allSameOrder)
+                break;
+        }
+    }

-                        T *z = outBuff + zOffset[r];
-                        T *x = inArrs[r]->bufferAsT<T>();
+    const bool luckCase2 = isZcontin && areInputsContin && allSameOrder;

-                        if (outEws == 1 && xEws == 1)
-                            for (Nd4jLong e = 0; e < arrLen; e++)
-                                z[e] = x[e];
-                        else
-                            for (Nd4jLong e = 0; e < arrLen; e++)
-                                z[e * outEws] = x[e * xEws];
-                    }
-                };
+    if(luckCase2) {     // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array

-                samediff::Threads::parallel_tad(func, 0, numOfArrs);
-                return;
+        const uint zDim       = output.sizeAt(axis);
+
+        for (uint i = 0; i < output.lengthOf() / zDim; ++i) {
+            T* z = zBuff + zDim * i;
+
+            for (uint j = 0; j < inArrs.size(); ++j) {
+                const auto xDim = inArrs[j]->sizeAt(axis);
+                const T* x = inArrs[j]->bufferAsT<T>() + xDim * i;
+                memcpy(z, x, xDim * sizeofT);
+                z += xDim;
            }
        }

-        const int rank  = inArrs[0]->rankOf();
-        const int rank2 = 2*rank;
-        std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
+        return;
+    }

-        // take into account indices for first array
-        indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
+    // general case
+    auto func = PRAGMA_THREADS_FOR {

-        // loop through the rest of input arrays
-        for(int i = 1; i < numOfArrs; ++i) {
-            indices[i][2 * axis]     = indices[i-1][2 * axis + 1];                                // index start from
-            indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis);      // index end with (excluding)
-        }
+        Nd4jLong coords[MAX_RANK];

-        auto func = PRAGMA_THREADS_FOR {
-            for (auto i = start; i < stop; i++) {
-                auto temp = output(indices[i], true);
-                nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
+        for (auto i = start; i < stop; i += increment) {
+
+            shape::index2coords(i, output.getShapeInfo(), coords);
+            const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
+
+            uint inArrIdx = 0;
+            uint xDim = inArrs[inArrIdx]->sizeAt(axis);
+
+            while (coords[axis] >= xDim) {
+                coords[axis] -= xDim;
+                xDim = inArrs[++inArrIdx]->sizeAt(axis);
            }
-        };

-        samediff::Threads::parallel_tad(func, 0, numOfArrs);
+            const T* x = inArrs[inArrIdx]->bufferAsT<T>();
+            const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords);
+
+            zBuff[zOffset] = x[xOffset];
+        }
+    };
+
+    samediff::Threads::parallel_for(func, 0, output.lengthOf());
 }

 /**
@ -128,6 +217,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
        delete inputs[i];
 }

+
 /**
 * This kernel accumulates X arrays, and stores result into Z
 *
--- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp
+++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp
@ -300,6 +300,8 @@ TEST_F(DeclarableOpsTests9, concat_test3) {
    ASSERT_EQ(ND4J_STATUS_OK, result->status());
    auto output = result->at(0);

+    output->printBuffer();
+
    ASSERT_TRUE(exp.isSameShape(output));
    ASSERT_TRUE(exp.equalsTo(output));

@ -620,12 +622,12 @@ TEST_F(DeclarableOpsTests9, concat_test18) {

    // we crate bunch of arrays, filled with specific values
    for (int e = 0; e < 2000; e++) {
-        auto array = NDArrayFactory::create_<float>('c', {1, 300});
+        auto array = NDArrayFactory::create_<int>('c', {1, 300});
        array->assign(e);
        context.setInputArray(e, array, true);
    }

-    auto z = NDArrayFactory::create<float>('c', {2000, 300});
+    auto z = NDArrayFactory::create<int>('c', {2000, 300});
    context.setOutputArray(0, &z, false);
    context.setIArguments(&axis, 1);

@ -633,8 +635,10 @@ TEST_F(DeclarableOpsTests9, concat_test18) {
    op.execute(&context);

    for (int e = 0; e < 2000; e++) {
+        auto exp = NDArrayFactory::create<int>('c', {300});
+        exp.assign(e);
        auto row = z.tensorAlongDimension(e, {1});
-        ASSERT_NEAR((float) e, row.e<float>(0), 1e-5f);
+        ASSERT_EQ(exp, row);
    }
 }

--- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp
@ -956,7 +956,7 @@ TEST_F(NDArrayTest2, subarray_1) {
    float    buffExpX3[]  = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000};
    Nd4jLong shapeExpX4[] = {3, 2, 1, 4, 12, 4, 1, 8192, 0, 99};
    float    buffExpX4[]  = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000};
-    Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 0, 99};
+    Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 4, 99};
    float    buffExpX5[]  = {4.000000, 8.000000, 12.000000, 16.000000, 20.000000, 24.000000};

    Nd4jLong shapeExpY0[] = {1, 2, 1, 8192, 1, 102};
--- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp
+++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp
@ -196,7 +196,7 @@ public:
    int dimensionLength = 2;
    int dimension[2] = {2,3};
    Nd4jLong tadAssertionC[10] = {3,4,4,1,4,1,16,16384,1,99};
-    Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,1,16384,1,102};
+    Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,16,16384,1,102};
 };


@ -222,7 +222,7 @@ TEST_F(LeadingOnes,OnesTest) {

 class NormalThreeFourFive : public testing::Test {
 public:
-    Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 0, 102};
+    Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 5, 99};
    Nd4jLong inputShapeBuffer[10] = {3,3,4,5,20,5,1,16384,1,99};
    int dimensionLength = 2;
    int dimension[2] = {0,1};
--- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java
+++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java
@ -7742,18 +7742,18 @@ public static final int PREALLOC_SIZE = 33554432;
 * @return the double at the specified index
 */

-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
-    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
+    @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords);

    @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer createShapeInfo(@Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer stride, int rank);
    @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer createShapeInfo(@Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer stride, int rank);
--- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java
+++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java
@ -16,6 +16,7 @@

 package org.nd4j.linalg.shape.concat;

+import lombok.extern.slf4j.Slf4j;
 import lombok.val;
 import org.junit.Ignore;
 import org.junit.Test;
@ -43,6 +44,7 @@ import static org.junit.Assert.assertTrue;
 /**
 * @author Adam Gibson
 */
+@Slf4j
@RunWith(Parameterized.class)
 public class ConcatTestsC extends BaseNd4jTest {

@ -309,7 +311,11 @@ public class ConcatTestsC extends BaseNd4jTest {
        for (int e = 0; e < 20000; e++)
            list.add(Nd4j.create(DataType.INT, 1, 300).assign(e));

+        val timeStart = System.nanoTime();
        val result = Nd4j.concat(0, list.toArray(new INDArray[list.size()]));
+        val timeEnd = System.nanoTime();
+
+        log.info("Time: {} us", (timeEnd - timeStart) / 1000);

        for (int e = 0; e < 20000; e++)
            assertEquals((float) e, result.getRow(e).meanNumber().floatValue(), 1e-5f);