diff --git a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu index 709f0ed2c..6587b4ca7 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/gather_nd.cu @@ -106,6 +106,7 @@ namespace nd4j { const auto xOffset = shape::getOffset(0, xShapeInfo + 1, xShapeInfo + xRank + 1, xCoordStart, xRank); z[zOffset] = x[xOffset]; + printf("z[%lld] = x[%lld] = %f\n", zOffset, xOffset, (float) z[zOffset]); } } @@ -124,7 +125,7 @@ namespace nd4j { const int maxRank = nd4j::math::nd4j_max(indices.rankOf(), nd4j::math::nd4j_max(input.rankOf(), output.rankOf())); - const int threadsPerBlock = MAX_NUM_THREADS; + const int threadsPerBlock = 256; const int blocksPerGrid = (output.lengthOf() + threadsPerBlock - 1) / threadsPerBlock; const int sharedMem = 8 * threadsPerBlock * maxRank + 128; diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp index b596ebcd5..1fbe81046 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests5.cpp @@ -815,6 +815,23 @@ TEST_F(DeclarableOpsTests5, gatherNd_test7) { delete results; } +////////////////////////////////////////////////////////////////////// +TEST_F(DeclarableOpsTests5, gatherNd_test8) { + auto x = NDArrayFactory::create('c', {2, 2}, {1., 2., 3., 4.}); + auto y = NDArrayFactory::create('c', {2, 2}, {0, 0, 1, 1}); + auto e = NDArrayFactory::create('c', {2}, {1., 4.}); + + nd4j::ops::gather_nd op; + auto result = op.execute({&x, &y}, {}, {}); + ASSERT_EQ(Status::OK(), result->status()); + + auto z = result->at(0); + + ASSERT_EQ(e, *z); + + delete result; +} + ////////////////////////////////////////////////////////////////////// TEST_F(DeclarableOpsTests5, reverse_sequense_test1) { diff --git a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu index 7b9e788f7..4ab884d28 100644 --- a/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu +++ b/libnd4j/tests_cpu/layers_tests/NDArrayCudaBasicsTests.cu @@ -2261,304 +2261,4 @@ TEST_F(NDArrayCudaBasicsTests, Test_Empty_4) { ASSERT_TRUE(x->isEmpty()); delete x; -} - -// printCudaGlobal<<<1,1,0,*stream>>>(dX, 6); -// printCudaGlobal<<<1,1,0,*stream>>>(dXShapeInfo, 8); -// printCudaGlobal<<<1,1,0,*stream>>>(dZ, 2); -// printCudaGlobal<<<1,1,0,*stream>>>(dZShapeInfo, 6); -// printCudaGlobal<<<1,1,0,*stream>>>(dimension, 1); -// printCudaGlobal<<<1,1,0,*stream>>>(tadShapeInfo, 6); -// printCudaGlobal<<<1,1,0,*stream>>>(tadOffsets, 2); -// cudaStreamSynchronize(*stream); - -TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_1) { - - auto x = NDArrayFactory::create('c', {5,2}, {0,1,2,3,4,5,6,7,8,9}); - x.syncToHost(); - auto z = NDArrayFactory::create('c', {5, 8}); - z.syncToHost(); - - std::vector buffers(4); - std::vector shapes(4); - std::vector hostShapes(4); - - for (size_t i = 0; i < buffers.size(); i++) { - buffers[i] = x.specialBuffer(); - shapes[i] = x.specialShapeInfo(); - hostShapes[i] = x.shapeInfo(); - } - Nd4jPointer extra[2]; - extra[1] = x.getContext()->getCudaStream(); - ::concat(extra, 1, 4, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); -} - -TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_2) { - - auto x = NDArrayFactory::create('c', {5,2}, {0,1,2,3,4,5,6,7,8,9}); - auto z = NDArrayFactory::create('f', {5, 8}); - - std::vector buffers(4); - std::vector shapes(4); - std::vector hostShapes(4); - - x.syncToHost(); - z.syncToHost(); - - for (size_t i = 0; i < buffers.size(); i++) { - buffers[i] = x.specialBuffer(); - shapes[i] = x.specialShapeInfo(); - hostShapes[i] = x.shapeInfo(); - } - - Nd4jPointer extra[2]; - extra[1] = x.getContext()->getCudaStream(); - - ::concat(extra, 1, 4, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); -} - -TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_3) { - - auto x = NDArrayFactory::create('c', {2,3}, {1,2,3,4,5,6}); - auto y = NDArrayFactory::create('c', {1,3}, {7,8,9}); - auto z = NDArrayFactory::create('f', {3, 3}); - - - std::vector buffers(2); - std::vector shapes(2); - std::vector hostShapes(2); - - x.syncToHost(); - y.syncToHost(); - z.syncToHost(); - - buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo(); - buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo(); - - Nd4jPointer extra[2]; - extra[1] = x.getContext()->getCudaStream(); - - ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); -} - -TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_4) { - - auto x = NDArrayFactory::create('c', {2,3}, {1,2,3,4,5,6}); - auto y = NDArrayFactory::create('c', {1,3}, {7,8,9}); - auto z = NDArrayFactory::create('c', {3, 3}); - - x.syncToHost(); - y.syncToHost(); - z.syncToHost(); - - std::vector buffers(2); - std::vector shapes(2); - std::vector hostShapes(2); - - buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo(); - buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo(); - - Nd4jPointer extra[2]; - extra[1] = x.getContext()->getCudaStream(); - - ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); -} - -TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_5) { - - auto x = NDArrayFactory::create('c', {1,2,3}, {1,2,3,4,5,6}); - auto y = NDArrayFactory::create('c', {1,2,3}, {7,8,9,10,11, 12}); - - auto z = NDArrayFactory::create('c', {2, 2, 3}); - auto stream = x.getContext()->getCudaStream();//reinterpret_cast(&nativeStream); - std::vector buffers(2); - std::vector shapes(2); - std::vector hostShapes(2); - - buffers[0] = x.specialBuffer(); shapes[0] = x.specialShapeInfo(); hostShapes[0] = x.shapeInfo(); - buffers[1] = y.specialBuffer(); shapes[1] = y.specialShapeInfo(); hostShapes[1] = y.shapeInfo(); - - Nd4jPointer extra[2]; - extra[1] = x.getContext()->getCudaStream(); - - ::concat(extra, 0, 2, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); -} - -TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_6) { - - auto x1 = NDArrayFactory::create('c', {2,2,3}, {1,2,3,4,5,6,7,8, 9, 10,11,12}); - auto x2 = NDArrayFactory::create('c', {1,2,3}, {13,14,15,16,17, 18}); - auto x3 = NDArrayFactory::create('c', {1,2,3}, {19,20,21,22,23, 24}); - - x1.syncToHost(); - x2.syncToHost(); - x3.syncToHost(); - - auto z = NDArrayFactory::create('c', {4, 2, 3}); - - std::vector buffers(3); - std::vector shapes(3); - std::vector hostShapes(3); - - buffers[0] = x1.specialBuffer(); shapes[0] = x1.specialShapeInfo(); hostShapes[0] = x1.shapeInfo(); - buffers[1] = x2.specialBuffer(); shapes[1] = x2.specialShapeInfo(); hostShapes[1] = x2.shapeInfo(); - buffers[2] = x3.specialBuffer(); shapes[2] = x3.specialShapeInfo(); hostShapes[2] = x3.shapeInfo(); - - Nd4jPointer extra[2]; - extra[1] = x1.getContext()->getCudaStream(); - - ::concat(extra, 0, 3, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); -} - -TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_7) { - - auto x1 = NDArrayFactory::create(1); - auto x2 = NDArrayFactory::create(2); - auto x3 = NDArrayFactory::create(3); - - auto z = NDArrayFactory::create('c', {3}, {1,2,3}); - - x1.syncToHost(); - x2.syncToHost(); - x3.syncToHost(); - - std::vector buffers(3); - std::vector shapes(3); - std::vector hostShapes(3); - - buffers[0] = x1.specialBuffer(); shapes[0] = x1.specialShapeInfo(); hostShapes[0] = x1.shapeInfo(); - buffers[1] = x2.specialBuffer(); shapes[1] = x2.specialShapeInfo(); hostShapes[1] = x2.shapeInfo(); - buffers[2] = x3.specialBuffer(); shapes[2] = x3.specialShapeInfo(); hostShapes[2] = x3.shapeInfo(); - - Nd4jPointer extra[2]; - extra[1] = x1.getContext()->getCudaStream(); - - ::concat(extra, 0, 3, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); -} - -TEST_F(NDArrayCudaBasicsTests, Test_ConcatNative_8) { - - auto totalCount = 1000; - auto width = 300; - std::vector lx(totalCount); - for (int i = 0; i < totalCount; i++) { - lx[i] = NDArrayFactory::create('c', {1, width}); - lx[i].assign(i); - lx[i].syncToHost(); - } - - auto z = NDArrayFactory::create('c', {totalCount, width}); - - std::vector buffers(totalCount); - std::vector shapes(totalCount); - std::vector hostShapes(totalCount); - - for (size_t i = 0; i < lx.size(); i++) { - buffers[i] = lx[i].specialBuffer(); - shapes[i] = lx[i].specialShapeInfo(); - hostShapes[i] = lx[i].shapeInfo(); - } - - Nd4jPointer extra[2]; - extra[1] = nd4j::LaunchContext::defaultContext()->getCudaStream(); - - ::concat(extra, 0, totalCount, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); -} - -TEST_F(NDArrayCudaBasicsTests, TestTear_1) { - auto input = NDArrayFactory::create('c', {1, 10, 10}); - std::vector arrays; // = {NDArrayFactory::create('c', {1, 10, 10}), NDArrayFactory::create('c', {1, 10, 10}), NDArrayFactory::create('c', {1, 10, 10}), NDArrayFactory::create('c', {1, 10, 10}), NDArrayFactory::create('c', {1, 10, 10})}; - int total = 151; - for (int e = 0; e < total; e++) { - input.assign(e); - arrays.emplace_back(input); - } - auto z = NDArrayFactory::create('c', {total, 10, 10}); - - Nd4jPointer extra[1]; - extra[1] = input.getContext()->getCudaStream(); - - std::vector buffers(total); - std::vector shapes(total); - std::vector hostShapes(total); - - for (size_t i = 0; i < buffers.size(); i++) { - buffers[i] = arrays[i].specialBuffer(); - shapes[i] = arrays[i].specialShapeInfo(); - hostShapes[i] = arrays[i].shapeInfo(); - } - - ::concat(extra, 0, total, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); - nd4j::ops::tear op; - - auto result = op.execute({&z}, {}, {1, 2}); - //ASSERT_EQ(10, result->size()); - auto e = result->size() - 1; - //for (size_t e = 0; e < result->size(); e++) { -// arrays[e].printIndexedBuffer("Input list at 40"); -// result->at(e)->printIndexedBuffer("OUtput TEAR at 40"); - //} -// ASSERT_TRUE(tads->at(e)->equalsTo(result->at(e))); - - delete result; -// delete tads; -} - -TEST_F(NDArrayCudaBasicsTests, TestTear_2) { - - auto input = NDArrayFactory::create('c', {1, 10, 10}); - - std::vector arrays; // = {NDArrayFactory::create('c', {1, 10, 10}), NDArrayFactory::create('c', {1, 10, 10}), NDArrayFactory::create('c', {1, 10, 10}), NDArrayFactory::create('c', {1, 10, 10}), NDArrayFactory::create('c', {1, 10, 10})}; - for (int e = 0; e < 10; e++) { - input.assign(e); - arrays.emplace_back(input); - arrays[e].syncToHost(); - } - - auto z = NDArrayFactory::create('c', {10, 10, 10}); - - Nd4jPointer extra[2]; - extra[1] = input.getContext()->getCudaStream(); - - std::vector buffers(10); - std::vector shapes(10); - std::vector hostShapes(10); - - for (size_t i = 0; i < buffers.size(); i++) { - buffers[i] = arrays[i].specialBuffer(); - shapes[i] = arrays[i].specialShapeInfo(); - hostShapes[i] = arrays[i].shapeInfo(); - } - - std::vector dimsToExclude({1,2}); - - - ::concat(extra, 0, 10, nullptr, (Nd4jPointer*)hostShapes.data(), (Nd4jPointer*)buffers.data(), (Nd4jPointer*)shapes.data(), nullptr, z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), nullptr, nullptr); - - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input.getShapeInfo(), dimsToExclude); - //std::vector arraysData(arrays.size()); - Nd4jPointer* arraysData; - cudaError_t err = cudaMalloc(&arraysData, arrays.size() * sizeof(void*)); - if (err != 0) { - printf("Cannot allocate device memory for targets due error %d\n", err); - ASSERT_TRUE(false); - } - for (size_t i = 0; i < arrays.size(); i++) { - Nd4jPointer target = arrays[i].specialBuffer(); - cudaMemcpy(&arraysData[i], &target, sizeof(Nd4jPointer), cudaMemcpyHostToDevice); - } - ::tear(extra, z.buffer(), z.shapeInfo(), z.specialBuffer(), z.specialShapeInfo(), arraysData, input.specialShapeInfo(), packX.specialShapeInfo(), packX.specialOffsets()); -// auto result = op.execute({&z}, {}, {1, 2}); - - //ASSERT_EQ(10, result->size()); - err = cudaFree(arraysData); - if (err != 0) { - printf("Cannot deallocate device memory for targets due error %d\n", err); - ASSERT_TRUE(false); - } - -// ASSERT_TRUE(tads->at(e)->equalsTo(result->at(e))); - -// delete result; -// delete tads; -} +} \ No newline at end of file