[WIP] gatherND fix (#176)
* one test for gather_nd  Signed-off-by: raver119 <raver119@gmail.com>
* get rid of old concat tests  Signed-off-by: raver119 <raver119@gmail.com>
* one printf  Signed-off-by: raver119 <raver119@gmail.com>
* one more legacy test removed  Signed-off-by: raver119 <raver119@gmail.com>
* gatherNd launch params fix  Signed-off-by: raver119 <raver119@gmail.com>
* gatherNd launch params fix  Signed-off-by: raver119 <raver119@gmail.com>
parent 5cfbeb64ac · commit efbfafe3f7
@@ -106,6 +106,7 @@ namespace nd4j {
             const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, xCoordStart, xRank);
 
             z[zOffset] = x[xOffset];
+            printf("z[%lld] = x[%lld] = %f\n", zOffset, xOffset, (float) z[zOffset]);
         }
     }
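For context: `shape::getOffset` turns an n-dimensional coordinate into a linear buffer offset using the strides stored in the shape-info array, which is what lets the kernel address `x` and `z` through flat pointers. A minimal standalone sketch of that arithmetic (the `linearOffset` helper below is hypothetical, not the library function):

```cpp
#include <cstdint>

// For a strided array, the flat offset of a coordinate is the dot product
// of the coordinate with the per-axis strides.
static int64_t linearOffset(const int64_t* coords, const int64_t* strides, int rank) {
    int64_t offset = 0;
    for (int i = 0; i < rank; i++)
        offset += coords[i] * strides[i];
    return offset;
}
```

The added `printf` simply traces each copied element, which is consistent with the [WIP] marker on the commit title.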
@@ -124,7 +125,7 @@ namespace nd4j {
             const int maxRank = nd4j::math::nd4j_max<int>(indices.rankOf(), nd4j::math::nd4j_max<int>(input.rankOf(), output.rankOf()));
 
-            const int threadsPerBlock = MAX_NUM_THREADS;
+            const int threadsPerBlock = 256;
             const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
             const int sharedMem = 8 * threadsPerBlock * maxRank + 128;
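This hunk is the actual launch-params fix. The grid is sized by ceiling division so every output element gets a thread, and the shared-memory request scales with both block size and rank (8 bytes per `Nd4jLong` coordinate, per axis, per thread). Assuming `MAX_NUM_THREADS` is 512, the old sizing overruns the common 48 KB per-block shared-memory limit once `maxRank` reaches 12, which is the plausible motivation for hard-coding 256 threads. A sketch of the arithmetic (the `checkLaunchParams` helper is hypothetical, values illustrative):

```cpp
#include <cstdio>

// Reproduces the launcher's arithmetic and checks it against the common
// 48 KB (49152-byte) per-block shared-memory limit.
void checkLaunchParams(long long outputLength, int maxRank, int threadsPerBlock) {
    const int blocksPerGrid = (int)((outputLength + threadsPerBlock - 1) / threadsPerBlock); // ceil
    const int sharedMem = 8 * threadsPerBlock * maxRank + 128; // 8 bytes per coordinate per thread
    printf("blocks=%d, sharedMem=%d (%s limit)\n",
           blocksPerGrid, sharedMem, sharedMem <= 49152 ? "within" : "EXCEEDS");
}

int main() {
    checkLaunchParams(10000, 12, 512); // old: 8*512*12+128 = 49280, over the limit
    checkLaunchParams(10000, 12, 256); // new: 8*256*12+128 = 24704, fits
    return 0;
}
```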
@@ -815,6 +815,23 @@ TEST_F(DeclarableOpsTests5, gatherNd_test7) {
     delete results;
 }
 
+//////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests5, gatherNd_test8) {
+    auto x = NDArrayFactory::create<double>('c', {2, 2}, {1., 2., 3., 4.});
+    auto y = NDArrayFactory::create<int>('c', {2, 2}, {0, 0, 1, 1});
+    auto e = NDArrayFactory::create<double>('c', {2}, {1., 4.});
+
+    nd4j::ops::gather_nd op;
+    auto result = op.execute({&x, &y}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+
+    ASSERT_EQ(e, *z);
+
+    delete result;
+}
+
 //////////////////////////////////////////////////////////////////////
 TEST_F(DeclarableOpsTests5, reverse_sequense_test1) {
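The new test pins down the full-index case: each row of `y` is a complete coordinate into the 2x2 input, so the tuples (0,0) and (1,1) select 1. and 4. A host-side reference for exactly this case (a hypothetical sketch, not library code) makes the expected mapping explicit:

```cpp
#include <vector>

// Reference for gather_nd with a 2-D input and full index tuples:
// each row of `indices` is a complete (row, col) coordinate into `x`.
std::vector<double> gatherNdRef(const std::vector<double>& x, int xCols,
                                const std::vector<int>& indices, int numTuples) {
    std::vector<double> out(numTuples);
    for (int t = 0; t < numTuples; t++) {
        int row = indices[2 * t];
        int col = indices[2 * t + 1];
        out[t] = x[row * xCols + col];   // (0,0) -> 1., (1,1) -> 4.
    }
    return out;
}
```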
@@ -2261,304 +2261,4 @@ TEST_F(NDArrayCudaBasicsTests, Test_Empty_4) {
 
     ASSERT_TRUE(x->isEmpty());
     delete x;
 }
-
-// printCudaGlobal<double><<<1,1,0,*stream>>>(dX, 6);
-// printCudaGlobal<Nd4jLong><<<1,1,0,*stream>>>(dXShapeInfo, 8);
-// printCudaGlobal<double><<<1,1,0,*stream>>>(dZ, 2);
-// printCudaGlobal<Nd4jLong><<<1,1,0,*stream>>>(dZShapeInfo, 6);
-// printCudaGlobal<int><<<1,1,0,*stream>>>(dimension, 1);
-// printCudaGlobal<Nd4jLong><<<1,1,0,*stream>>>(tadShapeInfo, 6);
-// printCudaGlobal<Nd4jLong><<<1,1,0,*stream>>>(tadOffsets, 2);
-// cudaStreamSynchronize(*stream);
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_1) {
-    auto x = NDArrayFactory::create<float>('c', {5,2}, {0,1,2,3,4,5,6,7,8,9});
-    x.syncToHost();
-    auto z = NDArrayFactory::create<float>('c', {5, 8});
-    z.syncToHost();
-
-    std::vector<void*> buffers(4);
-    std::vector<Nd4jLong*> shapes(4);
-    std::vector<Nd4jLong*> hostShapes(4);
-
-    for (size_t i = 0; i < buffers.size(); i++) {
-        buffers[i] = x.specialBuffer();
-        shapes[i] = x.specialShapeInfo();
-        hostShapes[i] = x.shapeInfo();
-    }
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-    ::concat(extra, 1, 4, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_2) {
-    auto x = NDArrayFactory::create<float>('c', {5,2}, {0,1,2,3,4,5,6,7,8,9});
-    auto z = NDArrayFactory::create<float>('f', {5, 8});
-
-    std::vector<void*> buffers(4);
-    std::vector<Nd4jLong*> shapes(4);
-    std::vector<Nd4jLong*> hostShapes(4);
-
-    x.syncToHost();
-    z.syncToHost();
-
-    for (size_t i = 0; i < buffers.size(); i++) {
-        buffers[i] = x.specialBuffer();
-        shapes[i] = x.specialShapeInfo();
-        hostShapes[i] = x.shapeInfo();
-    }
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 1, 4, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_3) {
-    auto x = NDArrayFactory::create<float>('c', {2,3}, {1,2,3,4,5,6});
-    auto y = NDArrayFactory::create<float>('c', {1,3}, {7,8,9});
-    auto z = NDArrayFactory::create<float>('f', {3, 3});
-
-    std::vector<void*> buffers(2);
-    std::vector<Nd4jLong*> shapes(2);
-    std::vector<Nd4jLong*> hostShapes(2);
-
-    x.syncToHost();
-    y.syncToHost();
-    z.syncToHost();
-
-    buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo();
-    buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_4) {
-    auto x = NDArrayFactory::create<float>('c', {2,3}, {1,2,3,4,5,6});
-    auto y = NDArrayFactory::create<float>('c', {1,3}, {7,8,9});
-    auto z = NDArrayFactory::create<float>('c', {3, 3});
-
-    x.syncToHost();
-    y.syncToHost();
-    z.syncToHost();
-
-    std::vector<void*> buffers(2);
-    std::vector<Nd4jLong*> shapes(2);
-    std::vector<Nd4jLong*> hostShapes(2);
-
-    buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo();
-    buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_5) {
-    auto x = NDArrayFactory::create<float>('c', {1,2,3}, {1,2,3,4,5,6});
-    auto y = NDArrayFactory::create<float>('c', {1,2,3}, {7,8,9,10,11,12});
-
-    auto z = NDArrayFactory::create<float>('c', {2, 2, 3});
-    auto stream = x.getContext()->getCudaStream(); //reinterpret_cast<cudaStream_t *>(&nativeStream);
-    std::vector<void*> buffers(2);
-    std::vector<Nd4jLong*> shapes(2);
-    std::vector<Nd4jLong*> hostShapes(2);
-
-    buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo();
-    buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_6) {
-    auto x1 = NDArrayFactory::create<float>('c', {2,2,3}, {1,2,3,4,5,6,7,8,9,10,11,12});
-    auto x2 = NDArrayFactory::create<float>('c', {1,2,3}, {13,14,15,16,17,18});
-    auto x3 = NDArrayFactory::create<float>('c', {1,2,3}, {19,20,21,22,23,24});
-
-    x1.syncToHost();
-    x2.syncToHost();
-    x3.syncToHost();
-
-    auto z = NDArrayFactory::create<float>('c', {4, 2, 3});
-
-    std::vector<void*> buffers(3);
-    std::vector<Nd4jLong*> shapes(3);
-    std::vector<Nd4jLong*> hostShapes(3);
-
-    buffers[0] = x1.specialBuffer(); shapes[0] = x1.specialShapeInfo(); hostShapes[0] = x1.shapeInfo();
-    buffers[1] = x2.specialBuffer(); shapes[1] = x2.specialShapeInfo(); hostShapes[1] = x2.shapeInfo();
-    buffers[2] = x3.specialBuffer(); shapes[2] = x3.specialShapeInfo(); hostShapes[2] = x3.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x1.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 3, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_7) {
-    auto x1 = NDArrayFactory::create<float>(1);
-    auto x2 = NDArrayFactory::create<float>(2);
-    auto x3 = NDArrayFactory::create<float>(3);
-
-    auto z = NDArrayFactory::create<float>('c', {3}, {1,2,3});
-
-    x1.syncToHost();
-    x2.syncToHost();
-    x3.syncToHost();
-
-    std::vector<void*> buffers(3);
-    std::vector<Nd4jLong*> shapes(3);
-    std::vector<Nd4jLong*> hostShapes(3);
-
-    buffers[0] = x1.specialBuffer(); shapes[0] = x1.specialShapeInfo(); hostShapes[0] = x1.shapeInfo();
-    buffers[1] = x2.specialBuffer(); shapes[1] = x2.specialShapeInfo(); hostShapes[1] = x2.shapeInfo();
-    buffers[2] = x3.specialBuffer(); shapes[2] = x3.specialShapeInfo(); hostShapes[2] = x3.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x1.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 3, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_8) {
-    auto totalCount = 1000;
-    auto width = 300;
-    std::vector<NDArray> lx(totalCount);
-    for (int i = 0; i < totalCount; i++) {
-        lx[i] = NDArrayFactory::create<float>('c', {1, width});
-        lx[i].assign(i);
-        lx[i].syncToHost();
-    }
-
-    auto z = NDArrayFactory::create<float>('c', {totalCount, width});
-
-    std::vector<void*> buffers(totalCount);
-    std::vector<Nd4jLong*> shapes(totalCount);
-    std::vector<Nd4jLong*> hostShapes(totalCount);
-
-    for (size_t i = 0; i < lx.size(); i++) {
-        buffers[i] = lx[i].specialBuffer();
-        shapes[i] = lx[i].specialShapeInfo();
-        hostShapes[i] = lx[i].shapeInfo();
-    }
-
-    Nd4jPointer extra[2];
-    extra[1] = nd4j::LaunchContext::defaultContext()->getCudaStream();
-
-    ::concat(extra, 0, totalCount, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-
-TEST_F(NDArrayCudaBasicsTests, TestTear_1) {
-    auto input = NDArrayFactory::create<float>('c', {1, 10, 10});
-    std::vector<NDArray> arrays; // = {NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10})};
-    int total = 151;
-    for (int e = 0; e < total; e++) {
-        input.assign(e);
-        arrays.emplace_back(input);
-    }
-    auto z = NDArrayFactory::create<float>('c', {total, 10, 10});
-
-    Nd4jPointer extra[1];
-    extra[1] = input.getContext()->getCudaStream();
-
-    std::vector<void*> buffers(total);
-    std::vector<Nd4jLong*> shapes(total);
-    std::vector<Nd4jLong*> hostShapes(total);
-
-    for (size_t i = 0; i < buffers.size(); i++) {
-        buffers[i] = arrays[i].specialBuffer();
-        shapes[i] = arrays[i].specialShapeInfo();
-        hostShapes[i] = arrays[i].shapeInfo();
-    }
-
-    ::concat(extra, 0, total, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-    nd4j::ops::tear op;
-
-    auto result = op.execute({&z}, {}, {1, 2});
-    //ASSERT_EQ(10, result->size());
-    auto e = result->size() - 1;
-    //for (size_t e = 0; e < result->size(); e++) {
-    //    arrays[e].printIndexedBuffer("Input list at 40");
-    //    result->at(e)->printIndexedBuffer("OUtput TEAR at 40");
-    //}
-    // ASSERT_TRUE(tads->at(e)->equalsTo(result->at(e)));
-
-    delete result;
-    // delete tads;
-}
-
-TEST_F(NDArrayCudaBasicsTests, TestTear_2) {
-    auto input = NDArrayFactory::create<float>('c', {1, 10, 10});
-
-    std::vector<NDArray> arrays; // = {NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10})};
-    for (int e = 0; e < 10; e++) {
-        input.assign(e);
-        arrays.emplace_back(input);
-        arrays[e].syncToHost();
-    }
-
-    auto z = NDArrayFactory::create<float>('c', {10, 10, 10});
-
-    Nd4jPointer extra[2];
-    extra[1] = input.getContext()->getCudaStream();
-
-    std::vector<void*> buffers(10);
-    std::vector<Nd4jLong*> shapes(10);
-    std::vector<Nd4jLong*> hostShapes(10);
-
-    for (size_t i = 0; i < buffers.size(); i++) {
-        buffers[i] = arrays[i].specialBuffer();
-        shapes[i] = arrays[i].specialShapeInfo();
-        hostShapes[i] = arrays[i].shapeInfo();
-    }
-
-    std::vector<int> dimsToExclude({1,2});
-
-    ::concat(extra, 0, 10, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-
-    auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimsToExclude);
-    //std::vector<void*> arraysData(arrays.size());
-    Nd4jPointer* arraysData;
-    cudaError_t err = cudaMalloc(&arraysData, arrays.size() * sizeof(void*));
-    if (err != 0) {
-        printf("Cannot allocate device memory for targets due error %d\n", err);
-        ASSERT_TRUE(false);
-    }
-    for (size_t i = 0; i < arrays.size(); i++) {
-        Nd4jPointer target = arrays[i].specialBuffer();
-        cudaMemcpy(&arraysData[i], &target, sizeof(Nd4jPointer), cudaMemcpyHostToDevice);
-    }
-    ::tear(extra, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), arraysData, input.specialShapeInfo(), packX.specialShapeInfo(), packX.specialOffsets());
-    // auto result = op.execute({&z}, {}, {1, 2});
-
-    //ASSERT_EQ(10, result->size());
-    err = cudaFree(arraysData);
-    if (err != 0) {
-        printf("Cannot deallocate device memory for targets due error %d\n", err);
-        ASSERT_TRUE(false);
-    }
-
-    // ASSERT_TRUE(tads->at(e)->equalsTo(result->at(e)));
-
-    // delete result;
-    // delete tads;
-}
-}
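All of the removed tests drove the legacy native `::concat` entry point (and `::tear`) directly with hand-built buffer/shape pointer arrays. Equivalent coverage through the declarable op, in the same style as `gatherNd_test8` above, would look roughly like this (a sketch mirroring the removed Test_ConcatNative_4, not code from this commit):

```cpp
TEST_F(NDArrayCudaBasicsTests, Concat_DeclarableOp_Sketch) {
    auto x = NDArrayFactory::create<float>('c', {2, 3}, {1, 2, 3, 4, 5, 6});
    auto y = NDArrayFactory::create<float>('c', {1, 3}, {7, 8, 9});
    auto e = NDArrayFactory::create<float>('c', {3, 3}, {1, 2, 3, 4, 5, 6, 7, 8, 9});

    nd4j::ops::concat op;
    auto result = op.execute({&x, &y}, {}, {0});   // concatenate along axis 0
    ASSERT_EQ(Status::OK(), result->status());
    ASSERT_EQ(e, *result->at(0));

    delete result;
}
```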