profiling of concat op (both cuda and cpu) (#151)

* - profiling of concat op (both cuda and cpu)

Signed-off-by: Yurii <iuriish@yahoo.com>

* better comparison for large concat

Signed-off-by: raver119 <raver119@gmail.com>

* - further improving of concat op

Signed-off-by: Yurii <iuriish@yahoo.com>

* some loggin

Signed-off-by: raver119 <raver119@gmail.com>

* - add possibility to verify presence of trailing unities in shape and set strides/ews correspondingly
- restrict second simple case in concat op to c order only

Signed-off-by: Yurii <iuriish@yahoo.com>

* - move concat op to specials_single.cpp file

Signed-off-by: Yurii <iuriish@yahoo.com>

* - get rid of second concat op declaration in transforms.cpp file

Signed-off-by: Yurii <iuriish@yahoo.com>

Co-authored-by: raver119 <raver119@gmail.com>
master
Yurii Shyrma 2020-02-20 20:19:01 +02:00 committed by GitHub
parent 215641ea9e
commit f7a9190407
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 348 additions and 154 deletions

View File

@ -4866,7 +4866,7 @@ NDArray NDArray::operator()(const std::vector<Nd4jLong>& idx, const bool keepUni
}
}
Nd4jLong *shapeInfoNoUnities = newShapeInfo;
Nd4jLong *newShapeInfo2 = newShapeInfo;
if(!keepUnitiesInShape) {
@ -4877,18 +4877,18 @@ NDArray NDArray::operator()(const std::vector<Nd4jLong>& idx, const bool keepUni
dimsWithUnities.push_back(d);
if(!dimsWithUnities.empty())
shapeInfoNoUnities = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace());
newShapeInfo2 = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace());
}
// check if there is possibility to set ews = 1
shape::checkStridesSetEwsAndOrder(shapeInfoNoUnities);
shape::checkStridesEwsAndOrder(newShapeInfo2);
NDArray result(_buffer, ShapeDescriptor(shapeInfoNoUnities), getContext(), offset + getBufferOffset());
NDArray result(_buffer, ShapeDescriptor(newShapeInfo2), getContext(), offset + getBufferOffset());
result._isView = true;
RELEASE(newShapeInfo, getContext()->getWorkspace());
if(newShapeInfo != shapeInfoNoUnities)
RELEASE(shapeInfoNoUnities, getContext()->getWorkspace());
if(newShapeInfo != newShapeInfo2)
RELEASE(newShapeInfo2, getContext()->getWorkspace());
return result;
}

View File

@ -900,9 +900,9 @@ namespace shape {
* @return the double at the specified index
*/
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0);
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *indices, Nd4jLong baseOffset = 0);
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *indices, Nd4jLong baseOffset = 0);
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0);
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0);
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0);
ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank);
@ -1014,8 +1014,8 @@ namespace shape {
// if array is scalar or unit length vector then ews = 1 and order is preserved
// if array is common vector then ews = stride of non-unity dimension and order is preserved
// if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved
ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities);
ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo);
ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities);
ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo);
/**
* processes whole set of sub-arrays
@ -1041,7 +1041,7 @@ namespace shape {
ND4J_EXPORT _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities);
/**
* for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2
* for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2
* then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99}
*/
INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo);
@ -2071,7 +2071,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn
shapeInfo[i + 1 + rank] = temp[rearrange[i] + 1 + rank];
}
shape::checkStridesSetEwsAndOrder(shapeInfo);
shape::checkStridesEwsAndOrder(shapeInfo);
delete[] temp;
}
@ -2483,7 +2483,7 @@ INLINEDEF _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape)
newShapeBuffer[2 * newRank + 3] = shape::order(shapeBuffer);
// correct order and ews if necessary
shape::checkStridesSetEwsAndOrder(newShapeBuffer);
shape::checkStridesEwsAndOrder(newShapeBuffer);
delete[] indices;
@ -4092,7 +4092,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap
// set ews
if(oldEws == 0)
shape::checkStridesSetEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order
shape::checkStridesEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order
else {
newShapeInfo[2 * newRank + 3] = oldOrder; // order
*shape::ews(newShapeInfo) = oldEws; // ews
@ -4642,7 +4642,7 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong
}
//////////////////////////////////////////////////////////////////////
INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) {
INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo) {
// FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities)
Nd4jLong tempBuffer[2*MAX_RANK];
@ -4651,11 +4651,11 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) {
// exclude unities from shapeInfo
const int numOfNonUnities = shape::excludeUnitiesFromShapeInfo(shapeInfo, shape, strides);
shape::checkStridesSetEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides);
shape::checkStridesEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides);
}
//////////////////////////////////////////////////////////////////////
INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) {
INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) {
const int rank = shape::rank(shapeInfo);
@ -4673,19 +4673,32 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch
bool contiguous = true;
// *** check whether strides are in c contiguous order ***//
if(stridesNoUnities[numOfNonUnities - 1] != 1) // last stride should be always unity for c order
contiguous = false;
else {
//*** check whether strides are in c contiguous order ***//
for (uint i = 0; i < numOfNonUnities - 1; ++i) {
if(stridesNoUnities[i] != stridesNoUnities[i + 1] * shapeNoUnities[i + 1]) {
if(stridesNoUnities[i] != shapeNoUnities[i + 1] * stridesNoUnities[i + 1]) {
contiguous = false;
break;
}
}
}
if(contiguous) {
*shape::ews(shapeInfo) = 1;
// for example we have shapeInfo = {3, 5,1,1, 4,4,1, ...} then we should change it to shapeInfo = {3, 5,1,1, 4,4,4, ...ews=4}
if(numOfNonUnities < rank) { // unities are present in shape
int indNonUnit = rank - 1;
while(shape::shapeOf(shapeInfo)[indNonUnit--] == 1)
for(int j = indNonUnit + 2; j < rank; ++j)
shape::stride(shapeInfo)[j] = stridesNoUnities[numOfNonUnities - 1];
for(int j = indNonUnit; j >= 0; --j)
if(shape::shapeOf(shapeInfo)[j] == 1)
shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j + 1] * shape::stride(shapeInfo)[j + 1];
}
*shape::ews(shapeInfo) = stridesNoUnities[numOfNonUnities - 1];
shapeInfo[rank * 2 + 3] = 99;
return;
}
@ -4693,18 +4706,31 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch
contiguous = true;
//*** check whether strides are in f contiguous order ***//
if(stridesNoUnities[0] != 1) // first stride should be always unity for f order
contiguous = false;
else {
for (uint i = 1; i < numOfNonUnities; ++i) {
if(stridesNoUnities[i] != stridesNoUnities[i - 1] * shapeNoUnities[i - 1]) {
if(stridesNoUnities[i] != shapeNoUnities[i - 1] * stridesNoUnities[i - 1]) {
contiguous = false;
break;
}
}
}
if(contiguous) {
*shape::ews(shapeInfo) = 1;
// for example we have shapeInfo = {3, 1,1,5, 1,4,4, ...} then we should change it to shapeInfo = {3, 1,1,5, 4,4,4, ...ews=4}
if(numOfNonUnities < rank) { // unities are present in shape
int indNonUnit = 0;
while(shape::shapeOf(shapeInfo)[indNonUnit++] == 1)
for(int j = 0; j < indNonUnit - 1; ++j)
shape::stride(shapeInfo)[j] = stridesNoUnities[0];
for(int j = indNonUnit; j < rank; ++j)
if(shape::shapeOf(shapeInfo)[j] == 1)
shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j - 1] * shape::stride(shapeInfo)[j - 1];
}
*shape::ews(shapeInfo) = stridesNoUnities[0];
shapeInfo[rank * 2 + 3] = 102;
return;
}
@ -4756,7 +4782,7 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo
shape::calcOffsets(dimsSize, shape, strides, subArrOffsets);
// evaluate ews
shape::checkStridesSetEwsAndOrder(subArrShapeInfo);
shape::checkStridesEwsAndOrder(subArrShapeInfo);
delete []strides;
delete []shape;

View File

@ -42,8 +42,8 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) {
std::vector<int> arrsToDelete;
int index = 0;
bool allOfSameType = true;
auto theFirstRank = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0;
auto theFirstDatatype = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType();
auto rankOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0;
auto typeOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType();
for(int i = 0; i < numOfInArrs; ++i) {
auto input = INPUT_VARIABLE(i);
@ -51,10 +51,10 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) {
// TODO: follow two lines are in accordance to current tf.concat spec. Commented for compatibility with legacy
// REQUIRE_TRUE(currentRank > 0, 0, "Rank of input variable %i must be greater 0, but is %lld instead.", i, currentRank);
// REQUIRE_TRUE(theFirstRank == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, theFirstRank);
// REQUIRE_TRUE(rankOfFirstArr == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, rankOfFirstArr);
if(!input->isEmpty()) {
allOfSameType &= (theFirstDatatype == input->dataType());
allOfSameType &= (typeOfFirstArr == input->dataType());
if(input->rankOf() == 0) {
auto vec = new NDArray('c', {1}, input->dataType(), block.launchContext());

View File

@ -85,22 +85,90 @@ BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int blocksPerGrid
//////////////////////////////////////////////////////////////////////////
void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
const int numOfInArrs = inArrs.size();
const auto sizeofT = output.sizeOfT();
for(int i = 0; i < numOfInArrs; ++i)
inArrs[i]->syncToDevice();
output.syncToDevice();
bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1;
if(luckCase1) {
for (uint i = 0; i < numOfInArrs; ++i) {
luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1;
if(!luckCase1)
break;
}
}
if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f
void* z = static_cast<int8_t*>(output.getSpecialBuffer());
for (uint i = 0; i < numOfInArrs; ++i) {
const auto memAmountToCopy = inArrs[i]->lengthOf() * sizeofT;
cudaMemcpyAsync(z, static_cast<int8_t*>(inArrs[i]->getSpecialBuffer()), memAmountToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream());
z = static_cast<int8_t*>(z) + memAmountToCopy;
}
if(cudaStreamSynchronize(*context->getCudaStream()) != 0)
throw std::runtime_error("concat cuda: luckCase1 failed!");
for(int i = 0; i < numOfInArrs; ++i)
inArrs[i]->tickReadDevice();
output.tickWriteDevice();
return;
}
const bool isZcontin = output.strideAt(axis) == 1;
bool areInputsContin = true;
bool allSameOrder = true;
if(isZcontin) {
for (uint i = 0; i < inArrs.size(); ++i) {
areInputsContin &= inArrs[i]->strideAt(axis) == 1;
allSameOrder &= output.ordering() == inArrs[i]->ordering();
if(!areInputsContin || !allSameOrder)
break;
}
}
const bool luckCase2 = isZcontin && areInputsContin && allSameOrder;
if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array
const uint zDim = output.sizeAt(axis);
for (uint i = 0; i < output.lengthOf() / zDim; ++i) {
const auto iShift = i * sizeofT;
void* z = static_cast<int8_t*>(output.getSpecialBuffer()) + zDim * iShift;
for (uint j = 0; j < numOfInArrs; ++j) {
const auto xDim = inArrs[j]->sizeAt(axis);
void* x = static_cast<int8_t*>(inArrs[j]->getSpecialBuffer()) + xDim * iShift;
const auto memSizeToCopy = xDim * sizeofT;
cudaMemcpyAsync(z, x, memSizeToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream());
z = static_cast<int8_t*>(z) + memSizeToCopy;
}
}
if(cudaStreamSynchronize(*context->getCudaStream()) != 0)
throw std::runtime_error("concat cuda: luckCase2 failed!");
}
else { // general (slower) case
const int threadsPerBlock = 256;
const int blocksPerGrid = 512;
const int sharedMem = 512;
const int numOfArrs = inArrs.size();
for(int i = 0; i < numOfArrs; ++i)
inArrs[i]->syncToDevice();
output.syncToDevice();
// prepare arrays of pointers on buffers and shapes
std::vector<void*> hInBuffers(numOfArrs);
std::vector<Nd4jLong*> hInShapeInfo(numOfArrs);
std::vector<void*> hInBuffers(numOfInArrs);
std::vector<Nd4jLong*> hInShapeInfo(numOfInArrs);
for(int i = 0; i < numOfArrs; ++i) {
for(int i = 0; i < numOfInArrs; ++i) {
hInBuffers[i] = inArrs[i]->getSpecialBuffer();
hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo();
}
@ -113,10 +181,10 @@ void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs,
BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES);
manager.synchronize();
}
for(int i = 0; i < numOfArrs; ++i)
for(int i = 0; i < numOfInArrs; ++i)
inArrs[i]->tickReadDevice();
output.tickWriteDevice();
}

View File

@ -31,81 +31,170 @@
#include <helpers/Loops.h>
namespace nd4j {
/**
* Concatneate multi array of the same shape together
* along a particular dimension
*/
// template <typename T>
// void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
// const uint numOfArrs = inArrs.size();
// int outDim;
// const bool isOutputVector = output.isCommonVector(outDim);
// if(isOutputVector || (axis == 0 && output.ordering() == 'c')) {
// bool allVectorsOrScalars = true;
// const uint outEws = isOutputVector ? output.stridesOf()[outDim] : output.ews();
// std::vector<int> nonUnityDim(numOfArrs);
// std::vector<Nd4jLong> zOffset(numOfArrs);
// for(int i = 0; i < numOfArrs; i++) {
// allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i]));
// if(!allVectorsOrScalars)
// break;
// if(i == 0) zOffset[0] = 0;
// else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf();
// }
// if(allVectorsOrScalars) {
// T* outBuff = output.bufferAsT<T>();
// auto func = PRAGMA_THREADS_FOR {
// for (auto r = start; r < stop; r += increment) {
// const Nd4jLong arrLen = inArrs[r]->lengthOf();
// const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];
// T *z = outBuff + zOffset[r];
// T *x = inArrs[r]->bufferAsT<T>();
// if (outEws == 1 && xEws == 1)
// for (Nd4jLong e = 0; e < arrLen; e++)
// z[e] = x[e];
// else
// for (Nd4jLong e = 0; e < arrLen; e++)
// z[e * outEws] = x[e * xEws];
// }
// };
// samediff::Threads::parallel_tad(func, 0, numOfArrs);
// return;
// }
// }
// const int rank = inArrs[0]->rankOf();
// const int rank2 = 2*rank;
// std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
// // take into account indices for first array
// indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
// // loop through the rest of input arrays
// for(int i = 1; i < numOfArrs; ++i) {
// indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from
// indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding)
// }
// auto func = PRAGMA_THREADS_FOR {
// for (auto i = start; i < stop; i += increment) {
// auto temp = output(indices[i], true);
// nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
// }
// };
// samediff::Threads::parallel_tad(func, 0, numOfArrs);
// }
template <typename T>
void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
const uint numOfArrs = inArrs.size();
int outDim;
const bool isOutputVector = output.isCommonVector(outDim);
const int numOfInArrs = inArrs.size();
const auto sizeofT = output.sizeOfT();
if(isOutputVector || (axis == 0 && output.ordering() == 'c')) {
T* zBuff = output.bufferAsT<T>();
bool allVectorsOrScalars = true;
const uint outEws = isOutputVector ? output.stridesOf()[outDim] : output.ews();
bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1;
std::vector<int> nonUnityDim(numOfArrs);
std::vector<Nd4jLong> zOffset(numOfArrs);
for(int i = 0; i < numOfArrs; i++) {
allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i]));
if(!allVectorsOrScalars)
if(luckCase1) {
for (uint i = 0; i < numOfInArrs; ++i) {
luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1;
if(!luckCase1)
break;
if(i == 0) zOffset[0] = 0;
else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf();
}
}
if(allVectorsOrScalars) {
if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f
T* outBuff = output.bufferAsT<T>();
auto func = PRAGMA_THREADS_FOR {
for (auto r = start; r < stop; r++) {
const Nd4jLong arrLen = inArrs[r]->lengthOf();
const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];
T *z = outBuff + zOffset[r];
T *x = inArrs[r]->bufferAsT<T>();
if (outEws == 1 && xEws == 1)
for (Nd4jLong e = 0; e < arrLen; e++)
z[e] = x[e];
else
for (Nd4jLong e = 0; e < arrLen; e++)
z[e * outEws] = x[e * xEws];
T* z = zBuff;
for (uint i = 0; i < numOfInArrs; ++i) {
const auto memAmountToCopy = inArrs[i]->lengthOf();
memcpy(z, inArrs[i]->bufferAsT<T>(), memAmountToCopy * sizeofT);
z += memAmountToCopy;
}
};
samediff::Threads::parallel_tad(func, 0, numOfArrs);
return;
}
const bool isZcontin = output.strideAt(axis) == 1 && output.ordering() == 'c';
bool areInputsContin = true;
bool allSameOrder = true;
if(isZcontin) {
for (uint i = 0; i < numOfInArrs; ++i) {
areInputsContin &= inArrs[i]->strideAt(axis) == 1;
allSameOrder &= inArrs[i]->ordering() == output.ordering();
if(!areInputsContin || !allSameOrder)
break;
}
}
const int rank = inArrs[0]->rankOf();
const int rank2 = 2*rank;
std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
const bool luckCase2 = isZcontin && areInputsContin && allSameOrder;
// take into account indices for first array
indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array
// loop through the rest of input arrays
for(int i = 1; i < numOfArrs; ++i) {
indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from
indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding)
const uint zDim = output.sizeAt(axis);
for (uint i = 0; i < output.lengthOf() / zDim; ++i) {
T* z = zBuff + zDim * i;
for (uint j = 0; j < inArrs.size(); ++j) {
const auto xDim = inArrs[j]->sizeAt(axis);
const T* x = inArrs[j]->bufferAsT<T>() + xDim * i;
memcpy(z, x, xDim * sizeofT);
z += xDim;
}
}
return;
}
// general case
auto func = PRAGMA_THREADS_FOR {
for (auto i = start; i < stop; i++) {
auto temp = output(indices[i], true);
nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
Nd4jLong coords[MAX_RANK];
for (auto i = start; i < stop; i += increment) {
shape::index2coords(i, output.getShapeInfo(), coords);
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
uint inArrIdx = 0;
uint xDim = inArrs[inArrIdx]->sizeAt(axis);
while (coords[axis] >= xDim) {
coords[axis] -= xDim;
xDim = inArrs[++inArrIdx]->sizeAt(axis);
}
const T* x = inArrs[inArrIdx]->bufferAsT<T>();
const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords);
zBuff[zOffset] = x[xOffset];
}
};
samediff::Threads::parallel_tad(func, 0, numOfArrs);
samediff::Threads::parallel_for(func, 0, output.lengthOf());
}
/**
@ -128,6 +217,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
delete inputs[i];
}
/**
* This kernel accumulates X arrays, and stores result into Z
*

View File

@ -300,6 +300,8 @@ TEST_F(DeclarableOpsTests9, concat_test3) {
ASSERT_EQ(ND4J_STATUS_OK, result->status());
auto output = result->at(0);
output->printBuffer();
ASSERT_TRUE(exp.isSameShape(output));
ASSERT_TRUE(exp.equalsTo(output));
@ -620,12 +622,12 @@ TEST_F(DeclarableOpsTests9, concat_test18) {
// we crate bunch of arrays, filled with specific values
for (int e = 0; e < 2000; e++) {
auto array = NDArrayFactory::create_<float>('c', {1, 300});
auto array = NDArrayFactory::create_<int>('c', {1, 300});
array->assign(e);
context.setInputArray(e, array, true);
}
auto z = NDArrayFactory::create<float>('c', {2000, 300});
auto z = NDArrayFactory::create<int>('c', {2000, 300});
context.setOutputArray(0, &z, false);
context.setIArguments(&axis, 1);
@ -633,8 +635,10 @@ TEST_F(DeclarableOpsTests9, concat_test18) {
op.execute(&context);
for (int e = 0; e < 2000; e++) {
auto exp = NDArrayFactory::create<int>('c', {300});
exp.assign(e);
auto row = z.tensorAlongDimension(e, {1});
ASSERT_NEAR((float) e, row.e<float>(0), 1e-5f);
ASSERT_EQ(exp, row);
}
}

View File

@ -956,7 +956,7 @@ TEST_F(NDArrayTest2, subarray_1) {
float buffExpX3[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000};
Nd4jLong shapeExpX4[] = {3, 2, 1, 4, 12, 4, 1, 8192, 0, 99};
float buffExpX4[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000};
Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 0, 99};
Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 4, 99};
float buffExpX5[] = {4.000000, 8.000000, 12.000000, 16.000000, 20.000000, 24.000000};
Nd4jLong shapeExpY0[] = {1, 2, 1, 8192, 1, 102};

View File

@ -196,7 +196,7 @@ public:
int dimensionLength = 2;
int dimension[2] = {2,3};
Nd4jLong tadAssertionC[10] = {3,4,4,1,4,1,16,16384,1,99};
Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,1,16384,1,102};
Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,16,16384,1,102};
};
@ -222,7 +222,7 @@ TEST_F(LeadingOnes,OnesTest) {
class NormalThreeFourFive : public testing::Test {
public:
Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 0, 102};
Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 5, 99};
Nd4jLong inputShapeBuffer[10] = {3,3,4,5,20,5,1,16384,1,99};
int dimensionLength = 2;
int dimension[2] = {0,1};

View File

@ -7742,18 +7742,18 @@ public static final int PREALLOC_SIZE = 33554432;
* @return the double at the specified index
*/
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords);
@Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer createShapeInfo(@Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer stride, int rank);
@Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer createShapeInfo(@Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer stride, int rank);

View File

@ -16,6 +16,7 @@
package org.nd4j.linalg.shape.concat;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.junit.Ignore;
import org.junit.Test;
@ -43,6 +44,7 @@ import static org.junit.Assert.assertTrue;
/**
* @author Adam Gibson
*/
@Slf4j
@RunWith(Parameterized.class)
public class ConcatTestsC extends BaseNd4jTest {
@ -309,7 +311,11 @@ public class ConcatTestsC extends BaseNd4jTest {
for (int e = 0; e < 20000; e++)
list.add(Nd4j.create(DataType.INT, 1, 300).assign(e));
val timeStart = System.nanoTime();
val result = Nd4j.concat(0, list.toArray(new INDArray[list.size()]));
val timeEnd = System.nanoTime();
log.info("Time: {} us", (timeEnd - timeStart) / 1000);
for (int e = 0; e < 20000; e++)
assertEquals((float) e, result.getRow(e).meanNumber().floatValue(), 1e-5f);