[WIP] gatherND fix (#176)

* one test for gather_nd

Signed-off-by: raver119 <raver119@gmail.com>

* get rid of old concat tests

Signed-off-by: raver119 <raver119@gmail.com>

* one printf

Signed-off-by: raver119 <raver119@gmail.com>

* one more legacy test removed

Signed-off-by: raver119 <raver119@gmail.com>

* gatherNd launch params fix

Signed-off-by: raver119 <raver119@gmail.com>

* gatherNd launch params fix

Signed-off-by: raver119 <raver119@gmail.com>
master
raver119 2019-08-27 12:35:14 +03:00 committed by GitHub
parent 5cfbeb64ac
commit efbfafe3f7
3 changed files with 20 additions and 302 deletions


@@ -106,6 +106,7 @@ namespace nd4j {
            const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, xCoordStart, xRank);
            z[zOffset] = x[xOffset];
+           printf("z[%lld] = x[%lld] = %f\n", zOffset, xOffset, (float) z[zOffset]);
        }
    }
@@ -124,7 +125,7 @@ namespace nd4j {
    const int maxRank = nd4j::math::nd4j_max<int>(indices.rankOf(), nd4j::math::nd4j_max<int>(input.rankOf(), output.rankOf()));
-   const int threadsPerBlock = MAX_NUM_THREADS;
+   const int threadsPerBlock = 256;
    const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock;
    const int sharedMem = 8 * threadsPerBlock * maxRank + 128;
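
The launch-parameter fix is easiest to see in isolation. The kernel's dynamic shared memory grows with the block size (8 bytes per thread per rank), so a MAX_NUM_THREADS-sized block can exceed the 48 KB default per-block shared-memory limit once maxRank is large, while a fixed 256-thread block keeps the request modest and the ceil-division grid covers the rest of the output. A standalone sketch of the same arithmetic, with illustrative sizes rather than values taken from the library:

    #include <cstdio>

    int main() {
        // Hypothetical op sizes, chosen only to exercise the formulas above.
        const int outputLen = 100000;  // stands in for output.lengthOf()
        const int maxRank   = 6;      // max rank across input/indices/output

        const int threadsPerBlock = 256;  // the fixed value from this commit
        // Ceil-division: enough blocks that every output element gets a thread.
        const int blocksPerGrid = (outputLen + threadsPerBlock - 1) / threadsPerBlock;
        // Per-block scratch: 8-byte coordinates, one set of maxRank coords per thread.
        const int sharedMem = 8 * threadsPerBlock * maxRank + 128;

        // With a 1024-thread block (a common MAX_NUM_THREADS value), the same
        // formula gives 8 * 1024 * 6 + 128 = 49280 bytes, just over the
        // 49152-byte default per-block limit.
        printf("blocks=%d, threads=%d, sharedMem=%d bytes\n",
               blocksPerGrid, threadsPerBlock, sharedMem);
        return 0;
    }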


@@ -815,6 +815,23 @@ TEST_F(DeclarableOpsTests5, gatherNd_test7) {
    delete results;
}
+
+//////////////////////////////////////////////////////////////////////
+TEST_F(DeclarableOpsTests5, gatherNd_test8) {
+    auto x = NDArrayFactory::create<double>('c', {2, 2}, {1., 2., 3., 4.});
+    auto y = NDArrayFactory::create<int>('c', {2, 2}, {0, 0, 1, 1});
+    auto e = NDArrayFactory::create<double>('c', {2}, {1., 4.});
+
+    nd4j::ops::gather_nd op;
+    auto result = op.execute({&x, &y}, {}, {});
+    ASSERT_EQ(Status::OK(), result->status());
+
+    auto z = result->at(0);
+    ASSERT_EQ(e, *z);
+
+    delete result;
+}
+
//////////////////////////////////////////////////////////////////////
TEST_F(DeclarableOpsTests5, reverse_sequense_test1) {
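
The new test pins down the full-coordinate case of gather_nd: each row of the {2, 2} index array y is a complete coordinate into x, so {0, 0} selects 1. and {1, 1} selects 4. A minimal standalone sketch of that semantics, plain C++ rather than the library's kernel:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
        // x = [[1,2],[3,4]] stored row-major ('c' order), shape {2, 2}
        std::vector<double> x = {1., 2., 3., 4.};
        const int xCols = 2;

        // indices = [[0,0],[1,1]]: two full coordinates, one output element each
        std::vector<std::vector<int>> indices = {{0, 0}, {1, 1}};

        std::vector<double> z;
        for (const auto& coord : indices)
            z.push_back(x[coord[0] * xCols + coord[1]]);  // row-major offset

        assert(z[0] == 1. && z[1] == 4.);  // the expected output e = {1., 4.}
        printf("z = {%f, %f}\n", z[0], z[1]);
        return 0;
    }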


@@ -2262,303 +2262,3 @@ TEST_F(NDArrayCudaBasicsTests, Test_Empty_4) {
    ASSERT_TRUE(x->isEmpty());
    delete x;
}
-// printCudaGlobal<double><<<1,1,0,*stream>>>(dX, 6);
-// printCudaGlobal<Nd4jLong><<<1,1,0,*stream>>>(dXShapeInfo, 8);
-// printCudaGlobal<double><<<1,1,0,*stream>>>(dZ, 2);
-// printCudaGlobal<Nd4jLong><<<1,1,0,*stream>>>(dZShapeInfo, 6);
-// printCudaGlobal<int><<<1,1,0,*stream>>>(dimension, 1);
-// printCudaGlobal<Nd4jLong><<<1,1,0,*stream>>>(tadShapeInfo, 6);
-// printCudaGlobal<Nd4jLong><<<1,1,0,*stream>>>(tadOffsets, 2);
-// cudaStreamSynchronize(*stream);
-
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_1) {
-    auto x = NDArrayFactory::create<float>('c', {5,2}, {0,1,2,3,4,5,6,7,8,9});
-    x.syncToHost();
-    auto z = NDArrayFactory::create<float>('c', {5, 8});
-    z.syncToHost();
-
-    std::vector<void*> buffers(4);
-    std::vector<Nd4jLong*> shapes(4);
-    std::vector<Nd4jLong*> hostShapes(4);
-
-    for (size_t i = 0; i < buffers.size(); i++) {
-        buffers[i] = x.specialBuffer();
-        shapes[i] = x.specialShapeInfo();
-        hostShapes[i] = x.shapeInfo();
-    }
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 1, 4, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_2) {
-    auto x = NDArrayFactory::create<float>('c', {5,2}, {0,1,2,3,4,5,6,7,8,9});
-    auto z = NDArrayFactory::create<float>('f', {5, 8});
-
-    std::vector<void*> buffers(4);
-    std::vector<Nd4jLong*> shapes(4);
-    std::vector<Nd4jLong*> hostShapes(4);
-
-    x.syncToHost();
-    z.syncToHost();
-
-    for (size_t i = 0; i < buffers.size(); i++) {
-        buffers[i] = x.specialBuffer();
-        shapes[i] = x.specialShapeInfo();
-        hostShapes[i] = x.shapeInfo();
-    }
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 1, 4, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_3) {
-    auto x = NDArrayFactory::create<float>('c', {2,3}, {1,2,3,4,5,6});
-    auto y = NDArrayFactory::create<float>('c', {1,3}, {7,8,9});
-    auto z = NDArrayFactory::create<float>('f', {3, 3});
-
-    std::vector<void*> buffers(2);
-    std::vector<Nd4jLong*> shapes(2);
-    std::vector<Nd4jLong*> hostShapes(2);
-
-    x.syncToHost();
-    y.syncToHost();
-    z.syncToHost();
-
-    buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo();
-    buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_4) {
-    auto x = NDArrayFactory::create<float>('c', {2,3}, {1,2,3,4,5,6});
-    auto y = NDArrayFactory::create<float>('c', {1,3}, {7,8,9});
-    auto z = NDArrayFactory::create<float>('c', {3, 3});
-
-    x.syncToHost();
-    y.syncToHost();
-    z.syncToHost();
-
-    std::vector<void*> buffers(2);
-    std::vector<Nd4jLong*> shapes(2);
-    std::vector<Nd4jLong*> hostShapes(2);
-
-    buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo();
-    buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_5) {
-    auto x = NDArrayFactory::create<float>('c', {1,2,3}, {1,2,3,4,5,6});
-    auto y = NDArrayFactory::create<float>('c', {1,2,3}, {7,8,9,10,11,12});
-    auto z = NDArrayFactory::create<float>('c', {2, 2, 3});
-    auto stream = x.getContext()->getCudaStream(); //reinterpret_cast<cudaStream_t *>(&nativeStream);
-
-    std::vector<void*> buffers(2);
-    std::vector<Nd4jLong*> shapes(2);
-    std::vector<Nd4jLong*> hostShapes(2);
-
-    buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo();
-    buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_6) {
-    auto x1 = NDArrayFactory::create<float>('c', {2,2,3}, {1,2,3,4,5,6,7,8,9,10,11,12});
-    auto x2 = NDArrayFactory::create<float>('c', {1,2,3}, {13,14,15,16,17,18});
-    auto x3 = NDArrayFactory::create<float>('c', {1,2,3}, {19,20,21,22,23,24});
-
-    x1.syncToHost();
-    x2.syncToHost();
-    x3.syncToHost();
-
-    auto z = NDArrayFactory::create<float>('c', {4, 2, 3});
-
-    std::vector<void*> buffers(3);
-    std::vector<Nd4jLong*> shapes(3);
-    std::vector<Nd4jLong*> hostShapes(3);
-
-    buffers[0] = x1.specialBuffer(); shapes[0] = x1.specialShapeInfo(); hostShapes[0] = x1.shapeInfo();
-    buffers[1] = x2.specialBuffer(); shapes[1] = x2.specialShapeInfo(); hostShapes[1] = x2.shapeInfo();
-    buffers[2] = x3.specialBuffer(); shapes[2] = x3.specialShapeInfo(); hostShapes[2] = x3.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x1.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 3, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_7) {
-    auto x1 = NDArrayFactory::create<float>(1);
-    auto x2 = NDArrayFactory::create<float>(2);
-    auto x3 = NDArrayFactory::create<float>(3);
-    auto z = NDArrayFactory::create<float>('c', {3}, {1,2,3});
-
-    x1.syncToHost();
-    x2.syncToHost();
-    x3.syncToHost();
-
-    std::vector<void*> buffers(3);
-    std::vector<Nd4jLong*> shapes(3);
-    std::vector<Nd4jLong*> hostShapes(3);
-
-    buffers[0] = x1.specialBuffer(); shapes[0] = x1.specialShapeInfo(); hostShapes[0] = x1.shapeInfo();
-    buffers[1] = x2.specialBuffer(); shapes[1] = x2.specialShapeInfo(); hostShapes[1] = x2.shapeInfo();
-    buffers[2] = x3.specialBuffer(); shapes[2] = x3.specialShapeInfo(); hostShapes[2] = x3.shapeInfo();
-
-    Nd4jPointer extra[2];
-    extra[1] = x1.getContext()->getCudaStream();
-
-    ::concat(extra, 0, 3, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
-TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_8) {
-    auto totalCount = 1000;
-    auto width = 300;
-    std::vector<NDArray> lx(totalCount);
-
-    for (int i = 0; i < totalCount; i++) {
-        lx[i] = NDArrayFactory::create<float>('c', {1, width});
-        lx[i].assign(i);
-        lx[i].syncToHost();
-    }
-
-    auto z = NDArrayFactory::create<float>('c', {totalCount, width});
-
-    std::vector<void*> buffers(totalCount);
-    std::vector<Nd4jLong*> shapes(totalCount);
-    std::vector<Nd4jLong*> hostShapes(totalCount);
-
-    for (size_t i = 0; i < lx.size(); i++) {
-        buffers[i] = lx[i].specialBuffer();
-        shapes[i] = lx[i].specialShapeInfo();
-        hostShapes[i] = lx[i].shapeInfo();
-    }
-
-    Nd4jPointer extra[2];
-    extra[1] = nd4j::LaunchContext::defaultContext()->getCudaStream();
-
-    ::concat(extra, 0, totalCount, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-}
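
All of the removed Test_ConcatNative_* cases drive the raw ::concat entry point with three parallel arrays per input list (host shape-infos, device buffers, device shape-infos) plus host and device views of the output. The behaviour they exercised is ordinary axis-wise concatenation; for axis 0 on 'c'-ordered arrays it reduces to appending buffers end to end, as in this standalone sketch mirroring Test_ConcatNative_6's shapes (illustrative code, not the library implementation):

    #include <cassert>
    #include <vector>

    int main() {
        // Mirrors Test_ConcatNative_6: {2,2,3} + {1,2,3} + {1,2,3} -> {4,2,3}
        std::vector<std::vector<float>> inputs = {
            {1,2,3,4,5,6,7,8,9,10,11,12},   // shape {2,2,3}
            {13,14,15,16,17,18},            // shape {1,2,3}
            {19,20,21,22,23,24},            // shape {1,2,3}
        };

        std::vector<float> z;                // output, shape {4,2,3}
        for (const auto& in : inputs)        // axis-0 concat of 'c'-order arrays:
            z.insert(z.end(), in.begin(), in.end());  // contiguous append

        assert(z.size() == 4 * 2 * 3 && z.front() == 1 && z.back() == 24);
        return 0;
    }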
-TEST_F(NDArrayCudaBasicsTests, TestTear_1) {
-    auto input = NDArrayFactory::create<float>('c', {1, 10, 10});
-    std::vector<NDArray> arrays; // = {NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10})};
-    int total = 151;
-
-    for (int e = 0; e < total; e++) {
-        input.assign(e);
-        arrays.emplace_back(input);
-    }
-
-    auto z = NDArrayFactory::create<float>('c', {total, 10, 10});
-
-    Nd4jPointer extra[1];
-    extra[1] = input.getContext()->getCudaStream();
-
-    std::vector<void*> buffers(total);
-    std::vector<Nd4jLong*> shapes(total);
-    std::vector<Nd4jLong*> hostShapes(total);
-
-    for (size_t i = 0; i < buffers.size(); i++) {
-        buffers[i] = arrays[i].specialBuffer();
-        shapes[i] = arrays[i].specialShapeInfo();
-        hostShapes[i] = arrays[i].shapeInfo();
-    }
-
-    ::concat(extra, 0, total, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-
-    nd4j::ops::tear op;
-    auto result = op.execute({&z}, {}, {1, 2});
-    //ASSERT_EQ(10, result->size());
-    auto e = result->size() - 1;
-    //for (size_t e = 0; e < result->size(); e++) {
-    //    arrays[e].printIndexedBuffer("Input list at 40");
-    //    result->at(e)->printIndexedBuffer("OUtput TEAR at 40");
-    //}
-    // ASSERT_TRUE(tads->at(e)->equalsTo(result->at(e)));
-
-    delete result;
-    // delete tads;
-}
-TEST_F(NDArrayCudaBasicsTests, TestTear_2) {
-    auto input = NDArrayFactory::create<float>('c', {1, 10, 10});
-    std::vector<NDArray> arrays; // = {NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10}), NDArrayFactory::create<float>('c', {1, 10, 10})};
-
-    for (int e = 0; e < 10; e++) {
-        input.assign(e);
-        arrays.emplace_back(input);
-        arrays[e].syncToHost();
-    }
-
-    auto z = NDArrayFactory::create<float>('c', {10, 10, 10});
-
-    Nd4jPointer extra[2];
-    extra[1] = input.getContext()->getCudaStream();
-
-    std::vector<void*> buffers(10);
-    std::vector<Nd4jLong*> shapes(10);
-    std::vector<Nd4jLong*> hostShapes(10);
-
-    for (size_t i = 0; i < buffers.size(); i++) {
-        buffers[i] = arrays[i].specialBuffer();
-        shapes[i] = arrays[i].specialShapeInfo();
-        hostShapes[i] = arrays[i].shapeInfo();
-    }
-
-    std::vector<int> dimsToExclude({1,2});
-
-    ::concat(extra, 0, 10, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr);
-
-    auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimsToExclude);
-
-    //std::vector<void*> arraysData(arrays.size());
-    Nd4jPointer* arraysData;
-    cudaError_t err = cudaMalloc(&arraysData, arrays.size() * sizeof(void*));
-    if (err != 0) {
-        printf("Cannot allocate device memory for targets due error %d\n", err);
-        ASSERT_TRUE(false);
-    }
-
-    for (size_t i = 0; i < arrays.size(); i++) {
-        Nd4jPointer target = arrays[i].specialBuffer();
-        cudaMemcpy(&arraysData[i], &target, sizeof(Nd4jPointer), cudaMemcpyHostToDevice);
-    }
-
-    ::tear(extra, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), arraysData, input.specialShapeInfo(), packX.specialShapeInfo(), packX.specialOffsets());
-
-    // auto result = op.execute({&z}, {}, {1, 2});
-    //ASSERT_EQ(10, result->size());
-
-    err = cudaFree(arraysData);
-    if (err != 0) {
-        printf("Cannot deallocate device memory for targets due error %d\n", err);
-        ASSERT_TRUE(false);
-    }
-
-    // ASSERT_TRUE(tads->at(e)->equalsTo(result->at(e)));
-    // delete result;
-    // delete tads;
-}
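
The deleted TestTear_2 also demonstrates the device-side "array of device pointers" pattern: ::tear scatters into target buffers that live on the GPU, so the pointer table handed to it must itself be device memory, filled one pointer at a time with cudaMemcpy, exactly as the test copied each specialBuffer() into arraysData[i]. A stripped-down sketch of just that pattern, assuming a CUDA toolchain; buffer names and sizes are illustrative:

    #include <cstdio>
    #include <cuda_runtime.h>
    #include <vector>

    int main() {
        const int n = 4, len = 8;

        // One device target buffer per output, tracked by host-side pointers.
        std::vector<float*> devBuffers(n);
        for (int i = 0; i < n; i++)
            cudaMalloc(&devBuffers[i], len * sizeof(float));

        // Device-resident table of the n device pointers, filled entry by entry.
        float** arraysData;
        cudaMalloc(&arraysData, n * sizeof(float*));
        for (int i = 0; i < n; i++)
            cudaMemcpy(&arraysData[i], &devBuffers[i], sizeof(float*),
                       cudaMemcpyHostToDevice);

        // ... a kernel could now dereference arraysData[i] on the device ...

        cudaFree(arraysData);
        for (int i = 0; i < n; i++)
            cudaFree(devBuffers[i]);
        printf("done\n");
        return 0;
    }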