profiling of concat op (both cuda and cpu) (#151)
* - profiling of concat op (both cuda and cpu) Signed-off-by: Yurii <iuriish@yahoo.com> * better comparison for large concat Signed-off-by: raver119 <raver119@gmail.com> * - further improving of concat op Signed-off-by: Yurii <iuriish@yahoo.com> * some loggin Signed-off-by: raver119 <raver119@gmail.com> * - add possibility to verify presence of trailing unities in shape and set strides/ews correspondingly - restrict second simple case in concat op to c order only Signed-off-by: Yurii <iuriish@yahoo.com> * - move concat op to specials_single.cpp file Signed-off-by: Yurii <iuriish@yahoo.com> * - get rid of second concat op declaration in transforms.cpp file Signed-off-by: Yurii <iuriish@yahoo.com> Co-authored-by: raver119 <raver119@gmail.com>master
parent
215641ea9e
commit
f7a9190407
|
@ -4866,7 +4866,7 @@ NDArray NDArray::operator()(const std::vector<Nd4jLong>& idx, const bool keepUni
|
|||
}
|
||||
}
|
||||
|
||||
Nd4jLong *shapeInfoNoUnities = newShapeInfo;
|
||||
Nd4jLong *newShapeInfo2 = newShapeInfo;
|
||||
|
||||
if(!keepUnitiesInShape) {
|
||||
|
||||
|
@ -4877,18 +4877,18 @@ NDArray NDArray::operator()(const std::vector<Nd4jLong>& idx, const bool keepUni
|
|||
dimsWithUnities.push_back(d);
|
||||
|
||||
if(!dimsWithUnities.empty())
|
||||
shapeInfoNoUnities = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace());
|
||||
newShapeInfo2 = ShapeBuilders::copyShapeInfoWithoutUnites(newShapeInfo, dimsWithUnities.size(), dimsWithUnities.data(), getContext()->getWorkspace());
|
||||
}
|
||||
|
||||
// check if there is possibility to set ews = 1
|
||||
shape::checkStridesSetEwsAndOrder(shapeInfoNoUnities);
|
||||
shape::checkStridesEwsAndOrder(newShapeInfo2);
|
||||
|
||||
NDArray result(_buffer, ShapeDescriptor(shapeInfoNoUnities), getContext(), offset + getBufferOffset());
|
||||
NDArray result(_buffer, ShapeDescriptor(newShapeInfo2), getContext(), offset + getBufferOffset());
|
||||
result._isView = true;
|
||||
|
||||
RELEASE(newShapeInfo, getContext()->getWorkspace());
|
||||
if(newShapeInfo != shapeInfoNoUnities)
|
||||
RELEASE(shapeInfoNoUnities, getContext()->getWorkspace());
|
||||
if(newShapeInfo != newShapeInfo2)
|
||||
RELEASE(newShapeInfo2, getContext()->getWorkspace());
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -900,9 +900,9 @@ namespace shape {
|
|||
* @return the double at the specified index
|
||||
*/
|
||||
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *indices, Nd4jLong baseOffset = 0);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *indices, Nd4jLong baseOffset = 0);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *indices, Nd4jLong baseOffset = 0);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const Nd4jLong *coords, Nd4jLong baseOffset = 0);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const int *coords, Nd4jLong baseOffset = 0);
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong getOffset(const Nd4jLong *shapeInfo, const uint *coords, Nd4jLong baseOffset = 0);
|
||||
|
||||
ND4J_EXPORT _CUDA_HD Nd4jLong* createShapeInfo(Nd4jLong *shape, Nd4jLong *stride, int rank);
|
||||
|
||||
|
@ -1014,8 +1014,8 @@ namespace shape {
|
|||
// if array is scalar or unit length vector then ews = 1 and order is preserved
|
||||
// if array is common vector then ews = stride of non-unity dimension and order is preserved
|
||||
// if strides are normal/contiguous then ews = 1 and corresponding order is set, otherwise ews = 0 and order is preserved
|
||||
ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities);
|
||||
ND4J_EXPORT _CUDA_HD void checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo);
|
||||
ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnitDims, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities);
|
||||
ND4J_EXPORT _CUDA_HD void checkStridesEwsAndOrder(Nd4jLong* shapeInfo);
|
||||
|
||||
/**
|
||||
* processes whole set of sub-arrays
|
||||
|
@ -1041,7 +1041,7 @@ namespace shape {
|
|||
ND4J_EXPORT _CUDA_HD int excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, Nd4jLong*& shapeNoUnities, Nd4jLong*& stridesNoUnities);
|
||||
|
||||
/**
|
||||
* for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {2,3}, dimsSize = 2
|
||||
* for example inShapeInfo is {3, 2,1,3,1,4, 12,12,4,4,1, 16384,1,99}, dimsToExclude = {1,3}, dimsSize = 2
|
||||
* then outShapeInfo will contain {3, 2,3,4, 12,4,1, 16384,1,99}
|
||||
*/
|
||||
INLINEDEF _CUDA_HD void excludeUnitiesFromShapeInfo(const Nd4jLong* inShapeInfo, const int dimsSize, const int* dimsToExclude, Nd4jLong* outShapeInfo);
|
||||
|
@ -2071,7 +2071,7 @@ INLINEDEF _CUDA_HD Nd4jLong indexOffset(Nd4jLong index, const Nd4jLong* lShapeIn
|
|||
shapeInfo[i + 1 + rank] = temp[rearrange[i] + 1 + rank];
|
||||
}
|
||||
|
||||
shape::checkStridesSetEwsAndOrder(shapeInfo);
|
||||
shape::checkStridesEwsAndOrder(shapeInfo);
|
||||
|
||||
delete[] temp;
|
||||
}
|
||||
|
@ -2483,7 +2483,7 @@ INLINEDEF _CUDA_HD int numOfNonUnitDims(const int rank, const Nd4jLong* inShape)
|
|||
newShapeBuffer[2 * newRank + 3] = shape::order(shapeBuffer);
|
||||
|
||||
// correct order and ews if necessary
|
||||
shape::checkStridesSetEwsAndOrder(newShapeBuffer);
|
||||
shape::checkStridesEwsAndOrder(newShapeBuffer);
|
||||
|
||||
delete[] indices;
|
||||
|
||||
|
@ -4092,7 +4092,7 @@ INLINEDEF _CUDA_HD bool reshapeC(const Nd4jLong* oldShapeInfo, Nd4jLong* newShap
|
|||
|
||||
// set ews
|
||||
if(oldEws == 0)
|
||||
shape::checkStridesSetEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order
|
||||
shape::checkStridesEwsAndOrder(newShapeInfo, newOrder, newNumOfNonUnities, newShape, newStrides); // set ews and order
|
||||
else {
|
||||
newShapeInfo[2 * newRank + 3] = oldOrder; // order
|
||||
*shape::ews(newShapeInfo) = oldEws; // ews
|
||||
|
@ -4642,7 +4642,7 @@ INLINEDEF void calcOffsets(const int rank, const Nd4jLong* shape, const Nd4jLong
|
|||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) {
|
||||
INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo) {
|
||||
|
||||
// FIXME - indeed we don't need to allocate so large memory amount (2*MAX_RANK), sufficient amount is (2*oldNumOfNonUnities + 2*newNumOfNonUnities)
|
||||
Nd4jLong tempBuffer[2*MAX_RANK];
|
||||
|
@ -4651,11 +4651,11 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo) {
|
|||
// exclude unities from shapeInfo
|
||||
const int numOfNonUnities = shape::excludeUnitiesFromShapeInfo(shapeInfo, shape, strides);
|
||||
|
||||
shape::checkStridesSetEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides);
|
||||
shape::checkStridesEwsAndOrder(shapeInfo, shape::order(shapeInfo), numOfNonUnities, shape, strides);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) {
|
||||
INLINEDEF void _CUDA_HD checkStridesEwsAndOrder(Nd4jLong* shapeInfo, const char proposedOrder, const int numOfNonUnities, const Nd4jLong* shapeNoUnities, const Nd4jLong* stridesNoUnities) {
|
||||
|
||||
const int rank = shape::rank(shapeInfo);
|
||||
|
||||
|
@ -4673,19 +4673,32 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch
|
|||
|
||||
bool contiguous = true;
|
||||
|
||||
// *** check whether strides are in c contiguous order ***//
|
||||
if(stridesNoUnities[numOfNonUnities - 1] != 1) // last stride should be always unity for c order
|
||||
contiguous = false;
|
||||
else {
|
||||
for (uint i = 0; i < numOfNonUnities - 1; ++i) {
|
||||
if(stridesNoUnities[i] != stridesNoUnities[i + 1] * shapeNoUnities[i + 1]) {
|
||||
contiguous = false;
|
||||
break;
|
||||
}
|
||||
//*** check whether strides are in c contiguous order ***//
|
||||
for (uint i = 0; i < numOfNonUnities - 1; ++i) {
|
||||
if(stridesNoUnities[i] != shapeNoUnities[i + 1] * stridesNoUnities[i + 1]) {
|
||||
contiguous = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(contiguous) {
|
||||
*shape::ews(shapeInfo) = 1;
|
||||
|
||||
// for example we have shapeInfo = {3, 5,1,1, 4,4,1, ...} then we should change it to shapeInfo = {3, 5,1,1, 4,4,4, ...ews=4}
|
||||
if(numOfNonUnities < rank) { // unities are present in shape
|
||||
|
||||
int indNonUnit = rank - 1;
|
||||
|
||||
while(shape::shapeOf(shapeInfo)[indNonUnit--] == 1)
|
||||
|
||||
for(int j = indNonUnit + 2; j < rank; ++j)
|
||||
shape::stride(shapeInfo)[j] = stridesNoUnities[numOfNonUnities - 1];
|
||||
|
||||
for(int j = indNonUnit; j >= 0; --j)
|
||||
if(shape::shapeOf(shapeInfo)[j] == 1)
|
||||
shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j + 1] * shape::stride(shapeInfo)[j + 1];
|
||||
}
|
||||
|
||||
*shape::ews(shapeInfo) = stridesNoUnities[numOfNonUnities - 1];
|
||||
shapeInfo[rank * 2 + 3] = 99;
|
||||
return;
|
||||
}
|
||||
|
@ -4693,18 +4706,31 @@ INLINEDEF void _CUDA_HD checkStridesSetEwsAndOrder(Nd4jLong* shapeInfo, const ch
|
|||
contiguous = true;
|
||||
|
||||
//*** check whether strides are in f contiguous order ***//
|
||||
if(stridesNoUnities[0] != 1) // first stride should be always unity for f order
|
||||
contiguous = false;
|
||||
else {
|
||||
for (uint i = 1; i < numOfNonUnities; ++i) {
|
||||
if(stridesNoUnities[i] != stridesNoUnities[i - 1] * shapeNoUnities[i - 1]) {
|
||||
contiguous = false;
|
||||
break;
|
||||
}
|
||||
for (uint i = 1; i < numOfNonUnities; ++i) {
|
||||
if(stridesNoUnities[i] != shapeNoUnities[i - 1] * stridesNoUnities[i - 1]) {
|
||||
contiguous = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(contiguous) {
|
||||
*shape::ews(shapeInfo) = 1;
|
||||
|
||||
// for example we have shapeInfo = {3, 1,1,5, 1,4,4, ...} then we should change it to shapeInfo = {3, 1,1,5, 4,4,4, ...ews=4}
|
||||
if(numOfNonUnities < rank) { // unities are present in shape
|
||||
|
||||
int indNonUnit = 0;
|
||||
|
||||
while(shape::shapeOf(shapeInfo)[indNonUnit++] == 1)
|
||||
|
||||
for(int j = 0; j < indNonUnit - 1; ++j)
|
||||
shape::stride(shapeInfo)[j] = stridesNoUnities[0];
|
||||
|
||||
for(int j = indNonUnit; j < rank; ++j)
|
||||
if(shape::shapeOf(shapeInfo)[j] == 1)
|
||||
shape::stride(shapeInfo)[j] = shape::shapeOf(shapeInfo)[j - 1] * shape::stride(shapeInfo)[j - 1];
|
||||
}
|
||||
|
||||
*shape::ews(shapeInfo) = stridesNoUnities[0];
|
||||
shapeInfo[rank * 2 + 3] = 102;
|
||||
return;
|
||||
}
|
||||
|
@ -4756,7 +4782,7 @@ INLINEDEF _CUDA_HD void calcSubArrShapeAndOffsets(const Nd4jLong* wholeShapeInfo
|
|||
shape::calcOffsets(dimsSize, shape, strides, subArrOffsets);
|
||||
|
||||
// evaluate ews
|
||||
shape::checkStridesSetEwsAndOrder(subArrShapeInfo);
|
||||
shape::checkStridesEwsAndOrder(subArrShapeInfo);
|
||||
|
||||
delete []strides;
|
||||
delete []shape;
|
||||
|
|
|
@ -42,8 +42,8 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) {
|
|||
std::vector<int> arrsToDelete;
|
||||
int index = 0;
|
||||
bool allOfSameType = true;
|
||||
auto theFirstRank = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0;
|
||||
auto theFirstDatatype = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType();
|
||||
auto rankOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->rankOf() : 0;
|
||||
auto typeOfFirstArr = block.width() > 0 ? INPUT_VARIABLE(0)->dataType() : block.dataType();
|
||||
|
||||
for(int i = 0; i < numOfInArrs; ++i) {
|
||||
auto input = INPUT_VARIABLE(i);
|
||||
|
@ -51,10 +51,10 @@ CUSTOM_OP_IMPL(concat, -1, 1, false, 0, 0) {
|
|||
|
||||
// TODO: follow two lines are in accordance to current tf.concat spec. Commented for compatibility with legacy
|
||||
// REQUIRE_TRUE(currentRank > 0, 0, "Rank of input variable %i must be greater 0, but is %lld instead.", i, currentRank);
|
||||
// REQUIRE_TRUE(theFirstRank == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, theFirstRank);
|
||||
// REQUIRE_TRUE(rankOfFirstArr == currentRank, 0, "Number of dimensions in concat should be equals, but for %i input variable %lld != %lld appears.", i, currentRank, rankOfFirstArr);
|
||||
if(!input->isEmpty()) {
|
||||
|
||||
allOfSameType &= (theFirstDatatype == input->dataType());
|
||||
allOfSameType &= (typeOfFirstArr == input->dataType());
|
||||
|
||||
if(input->rankOf() == 0) {
|
||||
auto vec = new NDArray('c', {1}, input->dataType(), block.launchContext());
|
||||
|
|
|
@ -85,38 +85,106 @@ BUILD_SINGLE_TEMPLATE(template void concatCudaLauncher, (const int blocksPerGrid
|
|||
//////////////////////////////////////////////////////////////////////////
|
||||
void concat(nd4j::LaunchContext * context, const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
|
||||
|
||||
const int threadsPerBlock = 256;
|
||||
const int blocksPerGrid = 512;
|
||||
const int sharedMem = 512;
|
||||
const int numOfInArrs = inArrs.size();
|
||||
const auto sizeofT = output.sizeOfT();
|
||||
|
||||
const int numOfArrs = inArrs.size();
|
||||
|
||||
for(int i = 0; i < numOfArrs; ++i)
|
||||
for(int i = 0; i < numOfInArrs; ++i)
|
||||
inArrs[i]->syncToDevice();
|
||||
|
||||
output.syncToDevice();
|
||||
|
||||
// prepare arrays of pointers on buffers and shapes
|
||||
std::vector<void*> hInBuffers(numOfArrs);
|
||||
std::vector<Nd4jLong*> hInShapeInfo(numOfArrs);
|
||||
bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1;
|
||||
|
||||
for(int i = 0; i < numOfArrs; ++i) {
|
||||
hInBuffers[i] = inArrs[i]->getSpecialBuffer();
|
||||
hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo();
|
||||
if(luckCase1) {
|
||||
for (uint i = 0; i < numOfInArrs; ++i) {
|
||||
luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1;
|
||||
if(!luckCase1)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
PointersManager manager(context, "helpers::concat");
|
||||
if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f
|
||||
|
||||
void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*));
|
||||
void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*));
|
||||
void* z = static_cast<int8_t*>(output.getSpecialBuffer());
|
||||
|
||||
BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES);
|
||||
for (uint i = 0; i < numOfInArrs; ++i) {
|
||||
const auto memAmountToCopy = inArrs[i]->lengthOf() * sizeofT;
|
||||
cudaMemcpyAsync(z, static_cast<int8_t*>(inArrs[i]->getSpecialBuffer()), memAmountToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream());
|
||||
z = static_cast<int8_t*>(z) + memAmountToCopy;
|
||||
}
|
||||
|
||||
manager.synchronize();
|
||||
if(cudaStreamSynchronize(*context->getCudaStream()) != 0)
|
||||
throw std::runtime_error("concat cuda: luckCase1 failed!");
|
||||
|
||||
for(int i = 0; i < numOfArrs; ++i)
|
||||
for(int i = 0; i < numOfInArrs; ++i)
|
||||
inArrs[i]->tickReadDevice();
|
||||
output.tickWriteDevice();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const bool isZcontin = output.strideAt(axis) == 1;
|
||||
bool areInputsContin = true;
|
||||
bool allSameOrder = true;
|
||||
|
||||
if(isZcontin) {
|
||||
for (uint i = 0; i < inArrs.size(); ++i) {
|
||||
areInputsContin &= inArrs[i]->strideAt(axis) == 1;
|
||||
allSameOrder &= output.ordering() == inArrs[i]->ordering();
|
||||
if(!areInputsContin || !allSameOrder)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const bool luckCase2 = isZcontin && areInputsContin && allSameOrder;
|
||||
|
||||
if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array
|
||||
|
||||
const uint zDim = output.sizeAt(axis);
|
||||
|
||||
for (uint i = 0; i < output.lengthOf() / zDim; ++i) {
|
||||
|
||||
const auto iShift = i * sizeofT;
|
||||
void* z = static_cast<int8_t*>(output.getSpecialBuffer()) + zDim * iShift;
|
||||
|
||||
for (uint j = 0; j < numOfInArrs; ++j) {
|
||||
const auto xDim = inArrs[j]->sizeAt(axis);
|
||||
void* x = static_cast<int8_t*>(inArrs[j]->getSpecialBuffer()) + xDim * iShift;
|
||||
const auto memSizeToCopy = xDim * sizeofT;
|
||||
cudaMemcpyAsync(z, x, memSizeToCopy, cudaMemcpyDeviceToDevice, *context->getCudaStream());
|
||||
z = static_cast<int8_t*>(z) + memSizeToCopy;
|
||||
}
|
||||
}
|
||||
|
||||
if(cudaStreamSynchronize(*context->getCudaStream()) != 0)
|
||||
throw std::runtime_error("concat cuda: luckCase2 failed!");
|
||||
}
|
||||
else { // general (slower) case
|
||||
|
||||
const int threadsPerBlock = 256;
|
||||
const int blocksPerGrid = 512;
|
||||
const int sharedMem = 512;
|
||||
|
||||
// prepare arrays of pointers on buffers and shapes
|
||||
std::vector<void*> hInBuffers(numOfInArrs);
|
||||
std::vector<Nd4jLong*> hInShapeInfo(numOfInArrs);
|
||||
|
||||
for(int i = 0; i < numOfInArrs; ++i) {
|
||||
hInBuffers[i] = inArrs[i]->getSpecialBuffer();
|
||||
hInShapeInfo[i] = inArrs[i]->getSpecialShapeInfo();
|
||||
}
|
||||
|
||||
PointersManager manager(context, "helpers::concat");
|
||||
|
||||
void* dInBuffers = manager.replicatePointer(hInBuffers.data(), hInBuffers.size() * sizeof(void*));
|
||||
void* dInShapeInfo = manager.replicatePointer(hInShapeInfo.data(), hInShapeInfo.size() * sizeof(Nd4jLong*));
|
||||
|
||||
BUILD_SINGLE_SELECTOR(inArrs[0]->dataType(), concatCudaLauncher, (blocksPerGrid, threadsPerBlock, sharedMem, context->getCudaStream(), dInBuffers, dInShapeInfo, output.specialBuffer(), output.specialShapeInfo(), axis), LIBND4J_TYPES);
|
||||
|
||||
manager.synchronize();
|
||||
}
|
||||
|
||||
for(int i = 0; i < numOfInArrs; ++i)
|
||||
inArrs[i]->tickReadDevice();
|
||||
|
||||
output.tickWriteDevice();
|
||||
}
|
||||
|
||||
|
|
|
@ -31,81 +31,170 @@
|
|||
#include <helpers/Loops.h>
|
||||
|
||||
namespace nd4j {
|
||||
|
||||
/**
|
||||
* Concatneate multi array of the same shape together
|
||||
* along a particular dimension
|
||||
*/
|
||||
// template <typename T>
|
||||
// void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
|
||||
// const uint numOfArrs = inArrs.size();
|
||||
|
||||
// int outDim;
|
||||
// const bool isOutputVector = output.isCommonVector(outDim);
|
||||
|
||||
// if(isOutputVector || (axis == 0 && output.ordering() == 'c')) {
|
||||
|
||||
// bool allVectorsOrScalars = true;
|
||||
// const uint outEws = isOutputVector ? output.stridesOf()[outDim] : output.ews();
|
||||
|
||||
// std::vector<int> nonUnityDim(numOfArrs);
|
||||
// std::vector<Nd4jLong> zOffset(numOfArrs);
|
||||
|
||||
// for(int i = 0; i < numOfArrs; i++) {
|
||||
// allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i]));
|
||||
// if(!allVectorsOrScalars)
|
||||
// break;
|
||||
// if(i == 0) zOffset[0] = 0;
|
||||
// else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf();
|
||||
// }
|
||||
|
||||
// if(allVectorsOrScalars) {
|
||||
|
||||
// T* outBuff = output.bufferAsT<T>();
|
||||
|
||||
// auto func = PRAGMA_THREADS_FOR {
|
||||
// for (auto r = start; r < stop; r += increment) {
|
||||
// const Nd4jLong arrLen = inArrs[r]->lengthOf();
|
||||
// const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];
|
||||
|
||||
// T *z = outBuff + zOffset[r];
|
||||
// T *x = inArrs[r]->bufferAsT<T>();
|
||||
|
||||
// if (outEws == 1 && xEws == 1)
|
||||
// for (Nd4jLong e = 0; e < arrLen; e++)
|
||||
// z[e] = x[e];
|
||||
// else
|
||||
// for (Nd4jLong e = 0; e < arrLen; e++)
|
||||
// z[e * outEws] = x[e * xEws];
|
||||
// }
|
||||
// };
|
||||
|
||||
// samediff::Threads::parallel_tad(func, 0, numOfArrs);
|
||||
// return;
|
||||
// }
|
||||
// }
|
||||
|
||||
// const int rank = inArrs[0]->rankOf();
|
||||
// const int rank2 = 2*rank;
|
||||
// std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
|
||||
|
||||
// // take into account indices for first array
|
||||
// indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
|
||||
|
||||
// // loop through the rest of input arrays
|
||||
// for(int i = 1; i < numOfArrs; ++i) {
|
||||
// indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from
|
||||
// indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding)
|
||||
// }
|
||||
|
||||
// auto func = PRAGMA_THREADS_FOR {
|
||||
// for (auto i = start; i < stop; i += increment) {
|
||||
// auto temp = output(indices[i], true);
|
||||
// nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
|
||||
// }
|
||||
// };
|
||||
|
||||
// samediff::Threads::parallel_tad(func, 0, numOfArrs);
|
||||
// }
|
||||
|
||||
template <typename T>
|
||||
void SpecialMethods<T>::concatCpuGeneric(const std::vector<NDArray*>& inArrs, NDArray& output, const int axis) {
|
||||
const uint numOfArrs = inArrs.size();
|
||||
|
||||
int outDim;
|
||||
const bool isOutputVector = output.isCommonVector(outDim);
|
||||
const int numOfInArrs = inArrs.size();
|
||||
const auto sizeofT = output.sizeOfT();
|
||||
|
||||
if(isOutputVector || (axis == 0 && output.ordering() == 'c')) {
|
||||
T* zBuff = output.bufferAsT<T>();
|
||||
|
||||
bool allVectorsOrScalars = true;
|
||||
const uint outEws = isOutputVector ? output.stridesOf()[outDim] : output.ews();
|
||||
bool luckCase1 = ((axis == 0 && output.ordering() == 'c') || (axis == output.rankOf() - 1 && output.ordering() == 'f')) && output.ews() == 1;
|
||||
|
||||
std::vector<int> nonUnityDim(numOfArrs);
|
||||
std::vector<Nd4jLong> zOffset(numOfArrs);
|
||||
if(luckCase1) {
|
||||
for (uint i = 0; i < numOfInArrs; ++i) {
|
||||
luckCase1 &= inArrs[i]->ordering() == output.ordering() && inArrs[i]->ews() == 1;
|
||||
if(!luckCase1)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for(int i = 0; i < numOfArrs; i++) {
|
||||
allVectorsOrScalars &= (inArrs[i]->lengthOf() == 1 || inArrs[i]->isCommonVector(nonUnityDim[i]));
|
||||
if(!allVectorsOrScalars)
|
||||
break;
|
||||
if(i == 0) zOffset[0] = 0;
|
||||
else zOffset[i] = zOffset[i - 1] + outEws * inArrs[i - 1]->lengthOf();
|
||||
}
|
||||
if(luckCase1) { // for example {1,10} + {2,10} + {3,10} = {6, 10} order c; or {10,1} + {10,2} + {10,3} = {10, 6} order f
|
||||
|
||||
if(allVectorsOrScalars) {
|
||||
T* z = zBuff;
|
||||
for (uint i = 0; i < numOfInArrs; ++i) {
|
||||
const auto memAmountToCopy = inArrs[i]->lengthOf();
|
||||
memcpy(z, inArrs[i]->bufferAsT<T>(), memAmountToCopy * sizeofT);
|
||||
z += memAmountToCopy;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
T* outBuff = output.bufferAsT<T>();
|
||||
const bool isZcontin = output.strideAt(axis) == 1 && output.ordering() == 'c';
|
||||
bool areInputsContin = true;
|
||||
bool allSameOrder = true;
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto r = start; r < stop; r++) {
|
||||
const Nd4jLong arrLen = inArrs[r]->lengthOf();
|
||||
const uint xEws = (arrLen == 1) ? 1 : inArrs[r]->stridesOf()[nonUnityDim[r]];
|
||||
if(isZcontin) {
|
||||
for (uint i = 0; i < numOfInArrs; ++i) {
|
||||
areInputsContin &= inArrs[i]->strideAt(axis) == 1;
|
||||
allSameOrder &= inArrs[i]->ordering() == output.ordering();
|
||||
if(!areInputsContin || !allSameOrder)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
T *z = outBuff + zOffset[r];
|
||||
T *x = inArrs[r]->bufferAsT<T>();
|
||||
const bool luckCase2 = isZcontin && areInputsContin && allSameOrder;
|
||||
|
||||
if (outEws == 1 && xEws == 1)
|
||||
for (Nd4jLong e = 0; e < arrLen; e++)
|
||||
z[e] = x[e];
|
||||
else
|
||||
for (Nd4jLong e = 0; e < arrLen; e++)
|
||||
z[e * outEws] = x[e * xEws];
|
||||
}
|
||||
};
|
||||
if(luckCase2) { // for example {2,1,3} + {2,5,3} + {2,10,3} = {2,16,3}, here axis 1 shoud have stride = 1 for all inputs arrays and output array
|
||||
|
||||
samediff::Threads::parallel_tad(func, 0, numOfArrs);
|
||||
return;
|
||||
const uint zDim = output.sizeAt(axis);
|
||||
|
||||
for (uint i = 0; i < output.lengthOf() / zDim; ++i) {
|
||||
T* z = zBuff + zDim * i;
|
||||
|
||||
for (uint j = 0; j < inArrs.size(); ++j) {
|
||||
const auto xDim = inArrs[j]->sizeAt(axis);
|
||||
const T* x = inArrs[j]->bufferAsT<T>() + xDim * i;
|
||||
memcpy(z, x, xDim * sizeofT);
|
||||
z += xDim;
|
||||
}
|
||||
}
|
||||
|
||||
const int rank = inArrs[0]->rankOf();
|
||||
const int rank2 = 2*rank;
|
||||
std::vector<std::vector<Nd4jLong>> indices(numOfArrs, std::vector<Nd4jLong>(rank2,0));
|
||||
return;
|
||||
}
|
||||
|
||||
// take into account indices for first array
|
||||
indices[0][2 * axis + 1] = inArrs[0]->sizeAt(axis);
|
||||
// general case
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
|
||||
// loop through the rest of input arrays
|
||||
for(int i = 1; i < numOfArrs; ++i) {
|
||||
indices[i][2 * axis] = indices[i-1][2 * axis + 1]; // index start from
|
||||
indices[i][2 * axis + 1] = indices[i-1][2 * axis + 1] + inArrs[i]->sizeAt(axis); // index end with (excluding)
|
||||
}
|
||||
Nd4jLong coords[MAX_RANK];
|
||||
|
||||
auto func = PRAGMA_THREADS_FOR {
|
||||
for (auto i = start; i < stop; i++) {
|
||||
auto temp = output(indices[i], true);
|
||||
nd4j::TransformLoops<T, T, T>::template loopTransform<simdOps::Assign<T, T>>( inArrs[i]->bufferAsT<T>(), inArrs[i]->getShapeInfo(), temp.bufferAsT<T>(), temp.getShapeInfo(), nullptr, 0, 1);
|
||||
for (auto i = start; i < stop; i += increment) {
|
||||
|
||||
shape::index2coords(i, output.getShapeInfo(), coords);
|
||||
const auto zOffset = shape::getOffset(output.getShapeInfo(), coords);
|
||||
|
||||
uint inArrIdx = 0;
|
||||
uint xDim = inArrs[inArrIdx]->sizeAt(axis);
|
||||
|
||||
while (coords[axis] >= xDim) {
|
||||
coords[axis] -= xDim;
|
||||
xDim = inArrs[++inArrIdx]->sizeAt(axis);
|
||||
}
|
||||
};
|
||||
|
||||
samediff::Threads::parallel_tad(func, 0, numOfArrs);
|
||||
const T* x = inArrs[inArrIdx]->bufferAsT<T>();
|
||||
const auto xOffset = shape::getOffset(inArrs[inArrIdx]->getShapeInfo(), coords);
|
||||
|
||||
zBuff[zOffset] = x[xOffset];
|
||||
}
|
||||
};
|
||||
|
||||
samediff::Threads::parallel_for(func, 0, output.lengthOf());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -128,6 +217,7 @@ void SpecialMethods<T>::concatCpuGeneric(int dimension, int numArrays, Nd4jPoint
|
|||
delete inputs[i];
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This kernel accumulates X arrays, and stores result into Z
|
||||
*
|
||||
|
|
|
@ -300,6 +300,8 @@ TEST_F(DeclarableOpsTests9, concat_test3) {
|
|||
ASSERT_EQ(ND4J_STATUS_OK, result->status());
|
||||
auto output = result->at(0);
|
||||
|
||||
output->printBuffer();
|
||||
|
||||
ASSERT_TRUE(exp.isSameShape(output));
|
||||
ASSERT_TRUE(exp.equalsTo(output));
|
||||
|
||||
|
@ -620,12 +622,12 @@ TEST_F(DeclarableOpsTests9, concat_test18) {
|
|||
|
||||
// we crate bunch of arrays, filled with specific values
|
||||
for (int e = 0; e < 2000; e++) {
|
||||
auto array = NDArrayFactory::create_<float>('c', {1, 300});
|
||||
auto array = NDArrayFactory::create_<int>('c', {1, 300});
|
||||
array->assign(e);
|
||||
context.setInputArray(e, array, true);
|
||||
}
|
||||
|
||||
auto z = NDArrayFactory::create<float>('c', {2000, 300});
|
||||
auto z = NDArrayFactory::create<int>('c', {2000, 300});
|
||||
context.setOutputArray(0, &z, false);
|
||||
context.setIArguments(&axis, 1);
|
||||
|
||||
|
@ -633,8 +635,10 @@ TEST_F(DeclarableOpsTests9, concat_test18) {
|
|||
op.execute(&context);
|
||||
|
||||
for (int e = 0; e < 2000; e++) {
|
||||
auto exp = NDArrayFactory::create<int>('c', {300});
|
||||
exp.assign(e);
|
||||
auto row = z.tensorAlongDimension(e, {1});
|
||||
ASSERT_NEAR((float) e, row.e<float>(0), 1e-5f);
|
||||
ASSERT_EQ(exp, row);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -956,7 +956,7 @@ TEST_F(NDArrayTest2, subarray_1) {
|
|||
float buffExpX3[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000};
|
||||
Nd4jLong shapeExpX4[] = {3, 2, 1, 4, 12, 4, 1, 8192, 0, 99};
|
||||
float buffExpX4[] = {9.000000, 10.000000, 11.000000, 12.000000, 21.000000, 22.000000, 23.000000, 24.000000};
|
||||
Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 0, 99};
|
||||
Nd4jLong shapeExpX5[] = {2, 2, 3, 12, 4, 8192, 4, 99};
|
||||
float buffExpX5[] = {4.000000, 8.000000, 12.000000, 16.000000, 20.000000, 24.000000};
|
||||
|
||||
Nd4jLong shapeExpY0[] = {1, 2, 1, 8192, 1, 102};
|
||||
|
|
|
@ -196,7 +196,7 @@ public:
|
|||
int dimensionLength = 2;
|
||||
int dimension[2] = {2,3};
|
||||
Nd4jLong tadAssertionC[10] = {3,4,4,1,4,1,16,16384,1,99};
|
||||
Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,1,16384,1,102};
|
||||
Nd4jLong tadCAssertionF[10] = {3,4,4,1,1,4,16,16384,1,102};
|
||||
};
|
||||
|
||||
|
||||
|
@ -222,7 +222,7 @@ TEST_F(LeadingOnes,OnesTest) {
|
|||
|
||||
class NormalThreeFourFive : public testing::Test {
|
||||
public:
|
||||
Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 0, 102};
|
||||
Nd4jLong assertionBuffer[8] = {2, 3, 4, 20, 5, 16384, 5, 99};
|
||||
Nd4jLong inputShapeBuffer[10] = {3,3,4,5,20,5,1,16384,1,99};
|
||||
int dimensionLength = 2;
|
||||
int dimension[2] = {0,1};
|
||||
|
|
|
@ -7742,18 +7742,18 @@ public static final int PREALLOC_SIZE = 33554432;
|
|||
* @return the double at the specified index
|
||||
*/
|
||||
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer indices);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer indices);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] indices);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer indices);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer indices);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] indices);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Cast("const Nd4jLong*") LongPointer coords);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Cast("const Nd4jLong*") LongBuffer coords);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Cast("const Nd4jLong*") long[] coords);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongPointer shapeInfo, @Const IntPointer coords);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") LongBuffer shapeInfo, @Const IntBuffer coords);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords, @Cast("Nd4jLong") long baseOffset/*=0*/);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong") long getOffset(@Cast("const Nd4jLong*") long[] shapeInfo, @Const int[] coords);
|
||||
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong*") LongPointer createShapeInfo(@Cast("Nd4jLong*") LongPointer shape, @Cast("Nd4jLong*") LongPointer stride, int rank);
|
||||
@Namespace("shape") public static native @Cast("Nd4jLong*") LongBuffer createShapeInfo(@Cast("Nd4jLong*") LongBuffer shape, @Cast("Nd4jLong*") LongBuffer stride, int rank);
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
package org.nd4j.linalg.shape.concat;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import lombok.val;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
@ -43,6 +44,7 @@ import static org.junit.Assert.assertTrue;
|
|||
/**
|
||||
* @author Adam Gibson
|
||||
*/
|
||||
@Slf4j
|
||||
@RunWith(Parameterized.class)
|
||||
public class ConcatTestsC extends BaseNd4jTest {
|
||||
|
||||
|
@ -309,7 +311,11 @@ public class ConcatTestsC extends BaseNd4jTest {
|
|||
for (int e = 0; e < 20000; e++)
|
||||
list.add(Nd4j.create(DataType.INT, 1, 300).assign(e));
|
||||
|
||||
val timeStart = System.nanoTime();
|
||||
val result = Nd4j.concat(0, list.toArray(new INDArray[list.size()]));
|
||||
val timeEnd = System.nanoTime();
|
||||
|
||||
log.info("Time: {} us", (timeEnd - timeStart) / 1000);
|
||||
|
||||
for (int e = 0; e < 20000; e++)
|
||||
assertEquals((float) e, result.getRow(e).meanNumber().floatValue(), 1e-5f);
|
||||
|
|
Loading…
Reference in New Issue