diff --git a/libnd4j/blas/NDArray.hpp b/libnd4j/blas/NDArray.hpp index 6c5f6a8c8..f7c6d0684 100644 --- a/libnd4j/blas/NDArray.hpp +++ b/libnd4j/blas/NDArray.hpp @@ -4866,7 +4866,7 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni } } - Nd4jLong *shapeInfoNoUnities = newShapeInfo; + Nd4jLong *newShapeInfo2 = newShapeInfo; if(!keepUnitiesInShape) { @@ -4877,18 +4877,18 @@ NDArray NDArray::operator()(const std::vector& idx, const bool keepUni dimsWithUnities.push_back(d); if(!dimsWithUnities.empty()) - shapeInfoNoUnities = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace()); + newShapeInfo2 = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace()); } // check if there is possibility to set ews = 1 - shape::checkStridesSetEwsAndOrder(shapeInfoNoUnities); + shape::checkStridesEwsAndOrder(newShapeInfo2); - NDArray result(_buffer, ShapeDescriptor(shapeInfoNoUnities), getContext(), offset + getBufferOffset()); + NDArray result(_buffer, ShapeDescriptor(newShapeInfo2), getContext(), offset + getBufferOffset()); result._isView = true; RELEASE(newShapeInfo, getContext()->getWorkspace()); - if(newShapeInfo != shapeInfoNoUnities) - RELEASE(shapeInfoNoUnities, getContext()->getWorkspace()); + if(newShapeInfo != newShapeInfo2) + RELEASE(newShapeInfo2, getContext()->getWorkspace()); return result; } diff --git a/libnd4j/include/helpers/shape.h b/libnd4j/include/helpers/shape.h index 3d1d96f4b..d4e95c65f 100644 --- a/libnd4j/include/helpers/shape.h +++ b/libnd4j/include/helpers/shape.h @@ -900,9 +900,9 @@ namespace shape { * @return the double at the specified index */ - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0); - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *indices, Nd4jLong baseOffset = 0); - ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *indices, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0); + ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0); ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank); @@ -1014,8 +1014,8 @@ namespace shape { // if array is scalar or unit length vector then ews = 1 and order is preserved // if array is common vector then ews = stride of non-unity dimension and order is preserved // if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved - ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities); - ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo); + ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities); + ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo); /** * processes whole set of sub-arrays @@ -1041,7 +1041,7 @@ namespace shape { ND4J_EXPORT _CUDA_HD int 
excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities); /** - * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2 + * for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2 * then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99} */ INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo); @@ -2071,7 +2071,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn shapeInfo[i + 1 + rank] = temp[rearrange[i] + 1 + rank]; } - shape::checkStridesSetEwsAndOrder(shapeInfo); + shape::checkStridesEwsAndOrder(shapeInfo); delete[] temp; } @@ -2483,7 +2483,7 @@ INLINEDEF _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape) newShapeBuffer[2 * newRank + 3] = shape::order(shapeBuffer); // correct order and ews if necessary - shape::checkStridesSetEwsAndOrder(newShapeBuffer); + shape::checkStridesEwsAndOrder(newShapeBuffer); delete[] indices; @@ -4092,7 +4092,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap // set ews if(oldEws == 0) - shape::checkStridesSetEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order + shape::checkStridesEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order else { newShapeInfo[2 * newRank + 3] = oldOrder; // order *shape::ews(newShapeInfo) = oldEws; // ews @@ -4642,7 +4642,7 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) { +INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo) { // FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities) Nd4jLong tempBuffer[2*MAX_RANK]; @@ -4651,11 +4651,11 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) { // exclude unities from shapeInfo const int numOfNonUnities = shape::excludeUnitiesFromShapeInfo(shapeInfo, shape, strides); - shape::checkStridesSetEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides); + shape::checkStridesEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides); } ////////////////////////////////////////////////////////////////////// -INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) { +INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) { const int rank = shape::rank(shapeInfo); @@ -4673,19 +4673,32 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch bool contiguous = true; - // *** check whether strides are in c contiguous order ***// - if(stridesNoUnities[numOfNonUnities - 1] != 1) // last stride should be always unity for c order - contiguous = false; - else { - for (uint i = 0; i < numOfNonUnities - 1; ++i) { - if(stridesNoUnities[i] != stridesNoUnities[i + 1] * shapeNoUnities[i + 1]) { - contiguous = false; - break; - } + //*** 
check whether strides are in c contiguous order ***// + for (uint i = 0; i < numOfNonUnities - 1; ++i) { + if(stridesNoUnities[i] != shapeNoUnities[i + 1] * stridesNoUnities[i + 1]) { + contiguous = false; + break; } } + if(contiguous) { - *shape::ews(shapeInfo) = 1; + + // for example we have shapeInfo = {3, 5,1,1, 4,4,1, ...} then we should change it to shapeInfo = {3, 5,1,1, 4,4,4, ...ews=4} + if(numOfNonUnities < rank) { // unities are present in shape + + int indNonUnit = rank - 1; + + while(shape::shapeOf(shapeInfo)[indNonUnit--] == 1) + + for(int j = indNonUnit + 2; j < rank; ++j) + shape::stride(shapeInfo)[j] = stridesNoUnities[numOfNonUnities - 1]; + + for(int j = indNonUnit; j >= 0; --j) + if(shape::shapeOf(shapeInfo)[j] == 1) + shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j + 1] * shape::stride(shapeInfo)[j + 1]; + } + + *shape::ews(shapeInfo) = stridesNoUnities[numOfNonUnities - 1]; shapeInfo[rank * 2 + 3] = 99; return; } @@ -4693,18 +4706,31 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch contiguous = true; //*** check whether strides are in f contiguous order ***// - if(stridesNoUnities[0] != 1) // first stride should be always unity for f order - contiguous = false; - else { - for (uint i = 1; i < numOfNonUnities; ++i) { - if(stridesNoUnities[i] != stridesNoUnities[i - 1] * shapeNoUnities[i - 1]) { - contiguous = false; - break; - } + for (uint i = 1; i < numOfNonUnities; ++i) { + if(stridesNoUnities[i] != shapeNoUnities[i - 1] * stridesNoUnities[i - 1]) { + contiguous = false; + break; } } + if(contiguous) { - *shape::ews(shapeInfo) = 1; + + // for example we have shapeInfo = {3, 1,1,5, 1,4,4, ...} then we should change it to shapeInfo = {3, 1,1,5, 4,4,4, ...ews=4} + if(numOfNonUnities < rank) { // unities are present in shape + + int indNonUnit = 0; + + while(shape::shapeOf(shapeInfo)[indNonUnit++] == 1) + + for(int j = 0; j < indNonUnit - 1; ++j) + shape::stride(shapeInfo)[j] = stridesNoUnities[0]; + + for(int j = indNonUnit; j < rank; ++j) + if(shape::shapeOf(shapeInfo)[j] == 1) + shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j - 1] * shape::stride(shapeInfo)[j - 1]; + } + + *shape::ews(shapeInfo) = stridesNoUnities[0]; shapeInfo[rank * 2 + 3] = 102; return; } @@ -4756,7 +4782,7 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo shape::calcOffsets(dimsSize, shape, strides, subArrOffsets); // evaluate ews - shape::checkStridesSetEwsAndOrder(subArrShapeInfo); + shape::checkStridesEwsAndOrder(subArrShapeInfo); delete []strides; delete []shape; diff --git a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp index 2003eef3f..faa59fa6c 100644 --- a/libnd4j/include/ops/declarable/generic/transforms/concat.cpp +++ b/libnd4j/include/ops/declarable/generic/transforms/concat.cpp @@ -42,8 +42,8 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { std::vector arrsToDelete; int index = 0; bool allOfSameType = true; - auto theFirstRank = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0; - auto theFirstDatatype = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType(); + auto rankOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0; + auto typeOfFirstArr = block.width() > 0 ? 
INPUT_VARIABLE(0)->dataType() : block.dataType(); for(int i = 0; i < numOfInArrs; ++i) { auto input = INPUT_VARIABLE(i); @@ -51,10 +51,10 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) { // TODO: follow two lines are in accordance to current tf.concat spec. Commented for compatibility with legacy // REQUIRE_TRUE(currentRank > 0, 0, "Rank of input variable %i must be greater 0, but is %lld instead.", i, currentRank); -// REQUIRE_TRUE(theFirstRank == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, theFirstRank); +// REQUIRE_TRUE(rankOfFirstArr == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, rankOfFirstArr); if(!input->isEmpty()) { - allOfSameType &= (theFirstDatatype == input->dataType()); + allOfSameType &= (typeOfFirstArr == input->dataType()); if(input->rankOf() == 0) { auto vec = new NDArray('c', {1}, input->dataType(), block.launchContext()); diff --git a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu index 43c0e4af9..b455ff659 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/concat.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/concat.cu @@ -85,38 +85,106 @@ BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int blocksPerGrid ////////////////////////////////////////////////////////////////////////// void concat(nd4j::LaunchContext * context, const std::vector& inArrs, NDArray& output, const int axis) { - const int threadsPerBlock = 256; - const int blocksPerGrid = 512; - const int sharedMem = 512; + const int numOfInArrs = inArrs.size(); + const auto sizeofT = output.sizeOfT(); - const int numOfArrs = inArrs.size(); - - for(int i = 0; i < numOfArrs; ++i) + for(int i = 0; i < numOfInArrs; ++i) inArrs[i]->syncToDevice(); - output.syncToDevice(); - // prepare arrays of pointers on buffers and shapes - std::vector hInBuffers(numOfArrs); - std::vector hInShapeInfo(numOfArrs); + bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1; - for(int i = 0; i < numOfArrs; ++i) { - hInBuffers[i] = inArrs[i]->getSpecialBuffer(); - hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo(); + if(luckCase1) { + for (uint i = 0; i < numOfInArrs; ++i) { + luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1; + if(!luckCase1) + break; + } } - PointersManager manager(context, "helpers::concat"); + if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f - void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*)); - void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*)); + void* z = static_cast(output.getSpecialBuffer()); - BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES); + for (uint i = 0; i < numOfInArrs; ++i) { + const auto memAmountToCopy = inArrs[i]->lengthOf() * sizeofT; + cudaMemcpyAsync(z, static_cast(inArrs[i]->getSpecialBuffer()), memAmountToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream()); + z = static_cast(z) + memAmountToCopy; + } - manager.synchronize(); + 
if(cudaStreamSynchronize(*context->getCudaStream()) != 0) + throw std::runtime_error("concat cuda: luckCase1 failed!"); - for(int i = 0; i < numOfArrs; ++i) + for(int i = 0; i < numOfInArrs; ++i) + inArrs[i]->tickReadDevice(); + output.tickWriteDevice(); + + return; + } + + const bool isZcontin = output.strideAt(axis) == 1; + bool areInputsContin = true; + bool allSameOrder = true; + + if(isZcontin) { + for (uint i = 0; i < inArrs.size(); ++i) { + areInputsContin &= inArrs[i]->strideAt(axis) == 1; + allSameOrder &= output.ordering() == inArrs[i]->ordering(); + if(!areInputsContin || !allSameOrder) + break; + } + } + + const bool luckCase2 = isZcontin && areInputsContin && allSameOrder; + + if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array + + const uint zDim = output.sizeAt(axis); + + for (uint i = 0; i < output.lengthOf() / zDim; ++i) { + + const auto iShift = i * sizeofT; + void* z = static_cast(output.getSpecialBuffer()) + zDim * iShift; + + for (uint j = 0; j < numOfInArrs; ++j) { + const auto xDim = inArrs[j]->sizeAt(axis); + void* x = static_cast(inArrs[j]->getSpecialBuffer()) + xDim * iShift; + const auto memSizeToCopy = xDim * sizeofT; + cudaMemcpyAsync(z, x, memSizeToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream()); + z = static_cast(z) + memSizeToCopy; + } + } + + if(cudaStreamSynchronize(*context->getCudaStream()) != 0) + throw std::runtime_error("concat cuda: luckCase2 failed!"); + } + else { // general (slower) case + + const int threadsPerBlock = 256; + const int blocksPerGrid = 512; + const int sharedMem = 512; + + // prepare arrays of pointers on buffers and shapes + std::vector hInBuffers(numOfInArrs); + std::vector hInShapeInfo(numOfInArrs); + + for(int i = 0; i < numOfInArrs; ++i) { + hInBuffers[i] = inArrs[i]->getSpecialBuffer(); + hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo(); + } + + PointersManager manager(context, "helpers::concat"); + + void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*)); + void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*)); + + BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES); + + manager.synchronize(); + } + + for(int i = 0; i < numOfInArrs; ++i) inArrs[i]->tickReadDevice(); - output.tickWriteDevice(); } diff --git a/libnd4j/include/ops/impl/specials_single.hpp b/libnd4j/include/ops/impl/specials_single.hpp index 030e9c6d7..ad63ee490 100644 --- a/libnd4j/include/ops/impl/specials_single.hpp +++ b/libnd4j/include/ops/impl/specials_single.hpp @@ -31,81 +31,170 @@ #include namespace nd4j { - /** * Concatneate multi array of the same shape together * along a particular dimension */ +// template +// void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { +// const uint numOfArrs = inArrs.size(); + +// int outDim; +// const bool isOutputVector = output.isCommonVector(outDim); + +// if(isOutputVector || (axis == 0 && output.ordering() == 'c')) { + +// bool allVectorsOrScalars = true; +// const uint outEws = isOutputVector ? 
output.stridesOf()[outDim] : output.ews(); + +// std::vector nonUnityDim(numOfArrs); +// std::vector zOffset(numOfArrs); + +// for(int i = 0; i < numOfArrs; i++) { +// allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i])); +// if(!allVectorsOrScalars) +// break; +// if(i == 0) zOffset[0] = 0; +// else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf(); +// } + +// if(allVectorsOrScalars) { + +// T* outBuff = output.bufferAsT(); + +// auto func = PRAGMA_THREADS_FOR { +// for (auto r = start; r < stop; r += increment) { +// const Nd4jLong arrLen = inArrs[r]->lengthOf(); +// const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + +// T *z = outBuff + zOffset[r]; +// T *x = inArrs[r]->bufferAsT(); + +// if (outEws == 1 && xEws == 1) +// for (Nd4jLong e = 0; e < arrLen; e++) +// z[e] = x[e]; +// else +// for (Nd4jLong e = 0; e < arrLen; e++) +// z[e * outEws] = x[e * xEws]; +// } +// }; + +// samediff::Threads::parallel_tad(func, 0, numOfArrs); +// return; +// } +// } + +// const int rank = inArrs[0]->rankOf(); +// const int rank2 = 2*rank; +// std::vector> indices(numOfArrs, std::vector(rank2,0)); + +// // take into account indices for first array +// indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis); + +// // loop through the rest of input arrays +// for(int i = 1; i < numOfArrs; ++i) { +// indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from +// indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) +// } + +// auto func = PRAGMA_THREADS_FOR { +// for (auto i = start; i < stop; i += increment) { +// auto temp = output(indices[i], true); +// nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); +// } +// }; + +// samediff::Threads::parallel_tad(func, 0, numOfArrs); +// } + template void SpecialMethods::concatCpuGeneric(const std::vector& inArrs, NDArray& output, const int axis) { - const uint numOfArrs = inArrs.size(); - int outDim; - const bool isOutputVector = output.isCommonVector(outDim); + const int numOfInArrs = inArrs.size(); + const auto sizeofT = output.sizeOfT(); - if(isOutputVector || (axis == 0 && output.ordering() == 'c')) { + T* zBuff = output.bufferAsT(); - bool allVectorsOrScalars = true; - const uint outEws = isOutputVector ? 
output.stridesOf()[outDim] : output.ews(); + bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1; - std::vector nonUnityDim(numOfArrs); - std::vector zOffset(numOfArrs); + if(luckCase1) { + for (uint i = 0; i < numOfInArrs; ++i) { + luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1; + if(!luckCase1) + break; + } + } - for(int i = 0; i < numOfArrs; i++) { - allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i])); - if(!allVectorsOrScalars) - break; - if(i == 0) zOffset[0] = 0; - else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf(); - } + if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f - if(allVectorsOrScalars) { + T* z = zBuff; + for (uint i = 0; i < numOfInArrs; ++i) { + const auto memAmountToCopy = inArrs[i]->lengthOf(); + memcpy(z, inArrs[i]->bufferAsT(), memAmountToCopy * sizeofT); + z += memAmountToCopy; + } + return; + } - T* outBuff = output.bufferAsT(); + const bool isZcontin = output.strideAt(axis) == 1 && output.ordering() == 'c'; + bool areInputsContin = true; + bool allSameOrder = true; - auto func = PRAGMA_THREADS_FOR { - for (auto r = start; r < stop; r++) { - const Nd4jLong arrLen = inArrs[r]->lengthOf(); - const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]]; + if(isZcontin) { + for (uint i = 0; i < numOfInArrs; ++i) { + areInputsContin &= inArrs[i]->strideAt(axis) == 1; + allSameOrder &= inArrs[i]->ordering() == output.ordering(); + if(!areInputsContin || !allSameOrder) + break; + } + } - T *z = outBuff + zOffset[r]; - T *x = inArrs[r]->bufferAsT(); + const bool luckCase2 = isZcontin && areInputsContin && allSameOrder; - if (outEws == 1 && xEws == 1) - for (Nd4jLong e = 0; e < arrLen; e++) - z[e] = x[e]; - else - for (Nd4jLong e = 0; e < arrLen; e++) - z[e * outEws] = x[e * xEws]; - } - }; + if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array - samediff::Threads::parallel_tad(func, 0, numOfArrs); - return; + const uint zDim = output.sizeAt(axis); + + for (uint i = 0; i < output.lengthOf() / zDim; ++i) { + T* z = zBuff + zDim * i; + + for (uint j = 0; j < inArrs.size(); ++j) { + const auto xDim = inArrs[j]->sizeAt(axis); + const T* x = inArrs[j]->bufferAsT() + xDim * i; + memcpy(z, x, xDim * sizeofT); + z += xDim; } } - const int rank = inArrs[0]->rankOf(); - const int rank2 = 2*rank; - std::vector> indices(numOfArrs, std::vector(rank2,0)); + return; + } - // take into account indices for first array - indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis); + // general case + auto func = PRAGMA_THREADS_FOR { - // loop through the rest of input arrays - for(int i = 1; i < numOfArrs; ++i) { - indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from - indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding) - } + Nd4jLong coords[MAX_RANK]; - auto func = PRAGMA_THREADS_FOR { - for (auto i = start; i < stop; i++) { - auto temp = output(indices[i], true); - nd4j::TransformLoops::template loopTransform>( inArrs[i]->bufferAsT(), inArrs[i]->getShapeInfo(), temp.bufferAsT(), temp.getShapeInfo(), nullptr, 0, 1); + for (auto i = start; i < stop; i += increment) { + + shape::index2coords(i, output.getShapeInfo(), coords); + const auto zOffset = 
shape::getOffset(output.getShapeInfo(), coords); + + uint inArrIdx = 0; + uint xDim = inArrs[inArrIdx]->sizeAt(axis); + + while (coords[axis] >= xDim) { + coords[axis] -= xDim; + xDim = inArrs[++inArrIdx]->sizeAt(axis); } - }; - samediff::Threads::parallel_tad(func, 0, numOfArrs); + const T* x = inArrs[inArrIdx]->bufferAsT(); + const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords); + + zBuff[zOffset] = x[xOffset]; + } + }; + + samediff::Threads::parallel_for(func, 0, output.lengthOf()); } /** @@ -128,6 +217,7 @@ void SpecialMethods::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint delete inputs[i]; } + /** * This kernel accumulates X arrays, and stores result into Z * diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp index 77634b052..773e1dc18 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests9.cpp @@ -300,6 +300,8 @@ TEST_F(DeclarableOpsTests9, concat_test3) { ASSERT_EQ(ND4J_STATUS_OK, result->status()); auto output = result->at(0); + output->printBuffer(); + ASSERT_TRUE(exp.isSameShape(output)); ASSERT_TRUE(exp.equalsTo(output)); @@ -620,12 +622,12 @@ TEST_F(DeclarableOpsTests9, concat_test18) { // we crate bunch of arrays, filled with specific values for (int e = 0; e < 2000; e++) { - auto array = NDArrayFactory::create_('c', {1, 300}); + auto array = NDArrayFactory::create_('c', {1, 300}); array->assign(e); context.setInputArray(e, array, true); } - auto z = NDArrayFactory::create('c', {2000, 300}); + auto z = NDArrayFactory::create('c', {2000, 300}); context.setOutputArray(0, &z, false); context.setIArguments(&axis, 1); @@ -633,8 +635,10 @@ TEST_F(DeclarableOpsTests9, concat_test18) { op.execute(&context); for (int e = 0; e < 2000; e++) { + auto exp = NDArrayFactory::create('c', {300}); + exp.assign(e); auto row = z.tensorAlongDimension(e, {1}); - ASSERT_NEAR((float) e, row.e(0), 1e-5f); + ASSERT_EQ(exp, row); } } diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp index e3dc1aefc..6d5366396 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/NDArrayTests2.cpp @@ -956,7 +956,7 @@ TEST_F(NDArrayTest2, subarray_1) { float buffExpX3[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000}; Nd4jLong shapeExpX4[] = {3, 2, 1, 4, 12, 4, 1, 8192, 0, 99}; float buffExpX4[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000}; - Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 0, 99}; + Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 4, 99}; float buffExpX5[] = {4.000000, 8.000000, 12.000000, 16.000000, 20.000000, 24.000000}; Nd4jLong shapeExpY0[] = {1, 2, 1, 8192, 1, 102}; diff --git a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp index a8f430fe3..fb0d7991a 100644 --- a/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp +++ b/libnd4j/tests_cpu/layers_tests/ShapeTests2.cpp @@ -43,7 +43,7 @@ public: Nd4jLong shape[3] = {3,4,5}; Nd4jLong *shapeBuffer; ThreeDTest() { - shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); } ~ThreeDTest() { delete[] shapeBuffer; @@ -196,11 +196,11 @@ public: int dimensionLength = 2; int dimension[2] = {2,3}; Nd4jLong tadAssertionC[10] = 
{3,4,4,1,4,1,16,16384,1,99}; - Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,1,16384,1,102}; + Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,16,16384,1,102}; }; -TEST_F(LeadingOnes,OnesTest) { +TEST_F(LeadingOnes,OnesTest) { shape::TAD *cTad = new shape::TAD; cTad->init(shapeBufferC,dimension,dimensionLength); @@ -222,7 +222,7 @@ TEST_F(LeadingOnes,OnesTest) { class NormalThreeFourFive : public testing::Test { public: - Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 0, 102}; + Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 5, 99}; Nd4jLong inputShapeBuffer[10] = {3,3,4,5,20,5,1,16384,1,99}; int dimensionLength = 2; int dimension[2] = {0,1}; @@ -243,7 +243,7 @@ class DimensionWarning : public testing::Test { public: int dimensionLength = 2; int dimensions[2] = {0,1}; - Nd4jLong shape[3] = {1,5,1}; + Nd4jLong shape[3] = {1,5,1}; Nd4jLong *shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); ~DimensionWarning() { @@ -324,7 +324,7 @@ public: int dimensionFour = 0; int dimensionLength = 1; FourDTest() { - threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); + threeDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 3, threeDShape); fourDShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'f', 4, fourDShape); } ~FourDTest() { @@ -491,7 +491,7 @@ TEST_F(LabelTest,LabelTad) { delete tad; } -TEST_F(ExpectedValuesTest,TadTest) { +TEST_F(ExpectedValuesTest,TadTest) { auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, mainShape); shape::TAD *tad = new shape::TAD; tad->init(shapeBuffer,testDimensions,3); @@ -528,7 +528,7 @@ TEST_F(ThreeDTest,TensorAlongDimensionTest) { } -TEST_F(NumTadTests,TadTest) { +TEST_F(NumTadTests,TadTest) { auto shape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, this->shape); shape::TAD *tad = new shape::TAD; tad->init(shape,&dimension,1); @@ -539,7 +539,7 @@ TEST_F(NumTadTests,TadTest) { } TEST_F(TADStall,TestStall) { - auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto shapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); shape::TAD *tad = new shape::TAD; tad->init(0,shapeInfo,this->dimensions,3); tad->createTadOnlyShapeInfo(); @@ -564,7 +564,7 @@ TEST_F(PermuteTest,PermuteShapeBufferTest) { Nd4jLong shapeToPermute[4] = {5,3,2,6}; Nd4jLong permutedOrder[4] = {6,2,3,5}; auto shapeBufferOriginal = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); - auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); + auto assertionShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shapeToPermute); shape::permuteShapeBufferInPlace(shapeBufferOriginal,normalOrder,shapeBufferOriginal); EXPECT_TRUE(arrsEquals(4,assertionShapeBuffer,shapeBufferOriginal)); @@ -585,9 +585,9 @@ TEST_F(ElementWiseStrideTest,ElementWiseStrideTest) { TEST_F(SliceVectorTest,RowColumnVectorTest) { Nd4jLong rowVectorShape[2] = {1,5}; - auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); + auto rowVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); Nd4jLong colVectorShape[2] = {5,1}; - auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, 
colVectorShape); + auto colVectorShapeInfo = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, colVectorShape); Nd4jLong *sliceRow = shape::sliceOfShapeBuffer(0,rowVectorShapeInfo); EXPECT_TRUE(arrsEquals(2,rowVectorShapeInfo,sliceRow)); Nd4jLong *scalarSliceInfo = shape::createScalarShapeInfo(); @@ -608,7 +608,7 @@ TEST_F(SliceTensorTest,TestSlice) { Nd4jLong shape[3] = {3,3,2}; auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); Nd4jLong sliceShape[2] = {3,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -619,9 +619,9 @@ TEST_F(SliceTensorTest,TestSlice) { TEST_F(SliceMatrixTest,TestSlice) { Nd4jLong shape[2] = {3,2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); Nd4jLong sliceShape[2] = {1,2}; - auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); + auto sliceShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, sliceShape); Nd4jLong *testSlice = shape::sliceOfShapeBuffer(0,shapeBuffer); EXPECT_TRUE(arrsEquals(2,sliceShapeBuffer,testSlice)); delete[] testSlice; @@ -664,13 +664,13 @@ TEST_F(TensorTwoFromFourDDimTest,TadTwoFromFourDimTest) { //Along dimension 1,2: expect matrix with shape [cols,dim2] //Along dimension 1,3: expect matrix with shape [cols,dim3] //Along dimension 2,3: expect matrix with shape [dim2,dim3] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); + auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 4, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; EXPECT_TRUE(arrsEquals(shape::rank(expectedShapeBuffer),expectedShape,shape::shapeOf(testShapeBuffer))); @@ -687,14 +687,14 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { //Along dimension 0,1: expect matrix with shape [rows,cols] //Along dimension 0,2: expect matrix with shape [rows,dim2] //Along dimension 1,2: expect matrix with shape [cols,dim2] - auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); + auto baseShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 3, shape); for(int i = 0; i < 3; i++) { int *dimArr = dims[i]; Nd4jLong *expectedShape = expectedShapes[i]; shape::TAD *tad = new shape::TAD; tad->init(baseShapeBuffer,dimArr,dimensionLength); - auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', dimensionLength, expectedShape); + auto expectedShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 
dimensionLength, expectedShape); tad->createTadOnlyShapeInfo(); Nd4jLong *testShapeBuffer = tad->tadOnlyShapeInfo; Nd4jLong *expectedStride = expectedStrides[i]; @@ -715,7 +715,7 @@ TEST_F(TensorTwoDimTest,TadTwoDimTest) { TEST_F(TensorOneDimTest,TadDimensionsForTensor) { Nd4jLong shape[3] = {rows,cols,dim2}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); for(int i = 0; i < rank; i++) { //Along dimension 0: expect row vector with length 'dims[i]' @@ -737,14 +737,14 @@ TEST_F(TensorOneDimTest,TadDimensionsForTensor) { TEST_F(MatrixTest,TadDimensionsForMatrix) { Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', rank, shape); shape::TAD *dimZero = new shape::TAD; dimZero->init(shapeBuffer,&dims[0],1); shape::TAD *dimOne = new shape::TAD; dimOne->init(shapeBuffer,&dims[1],1); //Along dimension 0: expect row vector with length 'rows' - Nd4jLong rowVectorShape[2] = {1,rows}; + Nd4jLong rowVectorShape[2] = {1,rows}; auto expectedDimZeroShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorShape); dimZero->createTadOnlyShapeInfo(); Nd4jLong *testDimZero = dimZero->tadOnlyShapeInfo; @@ -753,7 +753,7 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { delete[] expectedDimZeroShape; //Along dimension 1: expect row vector with length 'cols' - Nd4jLong rowVectorColShape[2] {1,cols}; + Nd4jLong rowVectorColShape[2] {1,cols}; auto expectedDimOneShape = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVectorColShape); dimOne->createTadOnlyShapeInfo(); Nd4jLong *testDimOneShape = dimOne->tadOnlyShapeInfo; @@ -767,12 +767,12 @@ TEST_F(MatrixTest,TadDimensionsForMatrix) { } TEST_F(VectorTest,VectorTadShape) { - Nd4jLong rowVector[2] = {2,2}; + Nd4jLong rowVector[2] = {2,2}; auto rowBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, rowVector); int rowDimension = 1; Nd4jLong columnVector[2] = {2,2}; - auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); + auto colShapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, columnVector); int colDimension = 0; @@ -811,7 +811,7 @@ TEST_F(VectorTest,LinspaceCombinationTest) { int len = rows * cols; double *linspaced = linspace(1,rows * cols,len); Nd4jLong shape[2] = {rows,cols}; - auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); + auto shapeBuffer = nd4j::ShapeBuilders::createShapeInfo(nd4j::DataType::FLOAT32, 'c', 2, shape); delete[] shapeBuffer; delete[] linspaced; diff --git a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java index c8b15c1a2..db2c941e9 100644 --- a/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java +++ b/nd4j/nd4j-backends/nd4j-backend-impls/nd4j-cuda/src/main/java/org/nd4j/nativeblas/Nd4jCuda.java @@ -7742,18 +7742,18 @@ public static final int PREALLOC_SIZE = 33554432; * @return the double at the specified index */ - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer 
shapeInfo, @Cast("const Nd4jLong*") LongPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/); - @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long 
getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/); + @Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords); @Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer createShapeInfo(@Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer stride, int rank); @Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer createShapeInfo(@Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer stride, int rank); diff --git a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java index 90e9015b1..bad97296f 100644 --- a/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java +++ b/nd4j/nd4j-backends/nd4j-tests/src/test/java/org/nd4j/linalg/shape/concat/ConcatTestsC.java @@ -16,6 +16,7 @@ package org.nd4j.linalg.shape.concat; +import lombok.extern.slf4j.Slf4j; import lombok.val; import org.junit.Ignore; import org.junit.Test; @@ -43,6 +44,7 @@ import static org.junit.Assert.assertTrue; /** * @author Adam Gibson */ +@Slf4j @RunWith(Parameterized.class) public class ConcatTestsC extends BaseNd4jTest { @@ -309,7 +311,11 @@ public class ConcatTestsC extends BaseNd4jTest { for (int e = 0; e < 20000; e++) list.add(Nd4j.create(DataType.INT, 1, 300).assign(e)); + val timeStart = System.nanoTime(); val result = Nd4j.concat(0, list.toArray(new INDArray[list.size()])); + val timeEnd = System.nanoTime(); + + log.info("Time: {} us", (timeEnd - timeStart) / 1000); for (int e = 0; e < 20000; e++) assertEquals((float) e, result.getRow(e).meanNumber().floatValue(), 1e-5f);
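
Reviewer note (not part of the patch): the core of this change replaces the per-element concat helpers with two copy-based fast paths — "luckCase1" (concat along axis 0 for 'c' order, or the last axis for 'f' order, with ews == 1 and matching ordering everywhere), which degenerates into one flat memcpy / cudaMemcpyAsync per input, and "luckCase2" (the concat axis has stride 1 in every array and all orderings match), which copies one contiguous segment per input for every outer slice — plus a coordinate-based general fallback. The sketch below only mimics those two fast paths on plain row-major 2-D buffers; the Matrix type and the concatAxis0/concatAxis1 names are invented for illustration and do not exist in libnd4j, and the sketch omits the ews/ordering/stride checks and the CUDA stream handling the real helpers perform.

    // Simplified, self-contained sketch of the two concat fast paths, using
    // plain row-major ('c'-order) 2-D buffers instead of NDArray.
    #include <cstring>
    #include <iostream>
    #include <vector>

    struct Matrix {                 // dense row-major buffer (illustrative only)
        size_t rows, cols;
        std::vector<float> data;    // rows * cols elements, contiguous
    };

    // "luckCase1" analogue: concat along axis 0 of c-order inputs is one flat
    // copy per input, since the inputs end up laid out back to back in the output.
    Matrix concatAxis0(const std::vector<Matrix>& in) {
        size_t rows = 0, cols = in[0].cols;
        for (const auto& m : in) rows += m.rows;
        Matrix out{rows, cols, std::vector<float>(rows * cols)};
        float* z = out.data.data();
        for (const auto& m : in) {
            std::memcpy(z, m.data.data(), m.data.size() * sizeof(float));
            z += m.data.size();
        }
        return out;
    }

    // "luckCase2" analogue: concat along the last axis; for each output row,
    // copy one contiguous segment from each input (axis stride == 1 everywhere).
    Matrix concatAxis1(const std::vector<Matrix>& in) {
        size_t rows = in[0].rows, cols = 0;
        for (const auto& m : in) cols += m.cols;
        Matrix out{rows, cols, std::vector<float>(rows * cols)};
        for (size_t r = 0; r < rows; ++r) {
            float* z = out.data.data() + r * cols;
            for (const auto& m : in) {               // append this input's row segment
                std::memcpy(z, m.data.data() + r * m.cols, m.cols * sizeof(float));
                z += m.cols;
            }
        }
        return out;
    }

    int main() {
        Matrix a{2, 3, {1, 2, 3, 4, 5, 6}};
        Matrix b{2, 2, {7, 8, 9, 10}};
        Matrix c{1, 3, {11, 12, 13}};

        Matrix v = concatAxis0({a, c});              // {2,3} + {1,3} -> {3,3}
        Matrix h = concatAxis1({a, b});              // {2,3} + {2,2} -> {2,5}

        for (float x : h.data) std::cout << x << ' ';// 1 2 3 7 8 4 5 6 9 10
        std::cout << '\n' << v.rows << 'x' << v.cols << '\n';
    }

The timing added to ConcatTestsC.java is presumably meant to observe exactly this effect: whenever the concat axis has stride 1 in all arrays, concat collapses into a handful of large memcpy / cudaMemcpyAsync calls instead of an element-wise loop (CPU) or kernel launch over output.lengthOf() (CUDA); only irregular layouts fall back to the index2coords/getOffset path.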