From 355c6b60961ee854db88fd4f24f3f96c62c355f4 Mon Sep 17 00:00:00 2001 From: raver119 Date: Thu, 5 Dec 2019 20:03:10 +0300 Subject: [PATCH] [WIP] reverse improvements (#115) * initial commit Signed-off-by: raver119 * reverse draft Signed-off-by: raver119 * reverse kernel Signed-off-by: raver119 * reverse kernel Signed-off-by: raver119 --- .../ops/declarable/helpers/cuda/reverse.cu | 95 +++++++++++++++---- .../layers_tests/DeclarableOpsTests1.cpp | 3 +- .../layers_tests/DeclarableOpsTests16.cpp | 41 ++++++++ .../layers_tests/DeclarableOpsTestsCuda1.cu | 18 +++- 4 files changed, 139 insertions(+), 18 deletions(-) diff --git a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu index aceebf7a0..90e15b21f 100644 --- a/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu +++ b/libnd4j/include/ops/declarable/helpers/cuda/reverse.cu @@ -30,6 +30,67 @@ namespace nd4j { namespace ops { namespace helpers { + template + static __global__ void reverseTadKernel(void* vinput, Nd4jLong *inputShape, void* voutput, Nd4jLong *outputShape, Nd4jLong *inputTadShape, Nd4jLong *inputTadOffsets, Nd4jLong *outputTadShape, Nd4jLong *outputTadOffsets, uint64_t limit, uint64_t numOfElemsToReverse, uint64_t numTads) { + auto input = reinterpret_cast(vinput); + auto output = reinterpret_cast(voutput); + const auto tid = blockIdx.x * blockDim.x + threadIdx.x; + const auto step = gridDim.x * blockDim.x; + + // this means that we'll have additional cycle, to move middle element + auto div = numOfElemsToReverse / 2; + auto odd = numOfElemsToReverse % 2 != 0; + auto rlimit = odd ? limit / 2 + 1 : limit / 2; + + // all threads operate in the same input/output space + for (uint64_t e = tid; e < rlimit; e += step) { + // finding out the TAD we're going to process + auto tadId = e / div; + + if (tadId >= numTads) + continue; + + // now finding out element within tad + auto idx = e % div; + + //printf("TID: %i; numTads: %lld; tadLength: %lld; tadId: %i, idx: %lld\n", tid, numTads, numOfElemsToReverse, tadId, idx); + + auto tadInput = input + inputTadOffsets[tadId]; + auto tadOutput = output + outputTadOffsets[tadId]; + + // we're calculating offsets within input TAD + auto fOffset = shape::getIndexOffset(idx, inputTadShape); + auto lOffset = shape::getIndexOffset(numOfElemsToReverse - idx - 1, inputTadShape); + + // now we're storing input values + auto v1 = tadInput[fOffset]; + auto v2 = tadInput[lOffset]; + + // now we're calculating offsets within output TAD + auto zfOffset = shape::getIndexOffset(idx, outputTadShape); + auto zlOffset = shape::getIndexOffset(numOfElemsToReverse - idx - 1, outputTadShape); + + // and saving values to output arrays + tadOutput[zfOffset] = v2; + tadOutput[zlOffset] = v1; + } + + // moving odd element in blocks + if (odd && threadIdx.x == 0) { + for (uint64_t e = blockIdx.x; e < numTads; e += gridDim.x) { + auto tadInput = input + inputTadOffsets[e]; + auto tadOutput = output + outputTadOffsets[e]; + + auto xOffset = shape::getIndexOffset(numOfElemsToReverse / 2, inputTadShape); + auto zOffset = shape::getIndexOffset(numOfElemsToReverse / 2, outputTadShape); + + tadOutput[zOffset] = tadInput[xOffset]; + } + } + + } + + template static __global__ void reverseArrayKernel(void* input, Nd4jLong *inputShape, void* output, Nd4jLong *outputShape, Nd4jLong numOfElemsToReverse) { const auto tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -52,7 +113,7 @@ namespace helpers { auto odd = numOfElemsToReverse % 2 != 0; auto limit = numOfElemsToReverse / 2; - for (Nd4jLong e = tid; e < limit; e += step) { + for (uint64_t e = tid; e < limit; e += step) { // we're calculating offsets within input array auto fOffset = shape::getIndexOffset(e, inputShape); auto lOffset = shape::getIndexOffset(numOfElemsToReverse - e - 1, inputShape); @@ -80,13 +141,19 @@ namespace helpers { } template - static void reverseArray(nd4j::LaunchContext * context, NDArray* input, NDArray* output, Nd4jLong numOfElemsToReverse) { + static void reverseTad(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, Nd4jLong *inputTadShape, Nd4jLong *inputTadOffsets, Nd4jLong *outputTadShape, Nd4jLong *outputTadOffsets, uint64_t tadLength) { + auto stream = context->getCudaStream(); + reverseTadKernel<<<256, 512, 8192, *stream>>>(input->getSpecialBuffer(), input->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), inputTadShape, inputTadOffsets, outputTadShape, outputTadOffsets, input->lengthOf(), tadLength, input->lengthOf() / tadLength); + } + + template + static void reverseArray(nd4j::LaunchContext * context, const NDArray* input, NDArray* output, Nd4jLong numOfElemsToReverse) { auto stream = context->getCudaStream(); Nd4jLong numOfReverse = numOfElemsToReverse; if (numOfElemsToReverse == 0) numOfReverse = input->lengthOf(); - reverseArrayKernel<<<256, 512, 8192, *stream>>>(input->specialBuffer(), input->specialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), numOfReverse); + reverseArrayKernel<<<256, 512, 8192, *stream>>>(input->getSpecialBuffer(), input->getSpecialShapeInfo(), output->specialBuffer(), output->specialShapeInfo(), numOfReverse); } @@ -153,27 +220,23 @@ namespace helpers { // we need to reverse axis only if that's new op std::vector dimensions = isBackProp ? ShapeUtils::evalDimsToExclude(input->rankOf(), *intArgs) : *intArgs; std::vector axis = ShapeUtils::evalDimsToExclude(input->rankOf(), dimensions); - auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), axis); - auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), axis); + auto packX = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(input->getShapeInfo(), dimensions); + auto packZ = nd4j::ConstantTadHelper::getInstance()->tadForDimensions(output->getShapeInfo(), dimensions); - auto listOut = output->allTensorsAlongDimension(dimensions); - auto listIn = input->allTensorsAlongDimension(dimensions); - NDArray *subArrIn, *subArrOut; NDArray::prepareSpecialUse({output}, {input}); - for(int i = 0; i < listIn->size(); ++i) { // listIn->size() = listOut->size() - subArrIn = listIn->at(i); - subArrOut = listOut->at(i); - BUILD_SINGLE_SELECTOR(input->dataType(), reverseArray, (context, subArrIn, subArrOut, 0), LIBND4J_TYPES); + + if (packX.numberOfTads() == 1) { + BUILD_SINGLE_SELECTOR(input->dataType(), reverseArray, (context, input, output, 0), LIBND4J_TYPES); + } else { + BUILD_SINGLE_SELECTOR(input->dataType(), reverseTad, (context, input, output, packX.platformShapeInfo(), packX.platformOffsets(), packZ.platformShapeInfo(), packZ.platformOffsets(), (uint64_t) (input->lengthOf() / packX.numberOfTads())), LIBND4J_TYPES); } - //BUILD_SINGLE_SELECTOR(input->dataType(), reverseArray, (context, const_cast(input), output, (int)0), LIBND4J_TYPES); + NDArray::registerSpecialUse({output}, {input}); - delete listOut; - delete listIn; } -BUILD_SINGLE_TEMPLATE(template void reverseArray, (nd4j::LaunchContext * context, NDArray *inArr, NDArray *outArr, Nd4jLong numOfElemsToReverse), LIBND4J_TYPES); +BUILD_SINGLE_TEMPLATE(template void reverseArray, (nd4j::LaunchContext * context, const NDArray *inArr, NDArray *outArr, Nd4jLong numOfElemsToReverse), LIBND4J_TYPES); } } diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp index 7036ef77f..60351cc52 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests1.cpp @@ -3523,7 +3523,8 @@ TEST_F(DeclarableOpsTests1, Reverse_7 ) { ASSERT_EQ(ND4J_STATUS_OK, results->status()); auto result = results->at(0); - // result->printBuffer(); + //expected.printIndexedBuffer("E"); + //result->printIndexedBuffer("R"); ASSERT_TRUE(expected.isSameShapeStrict(result)); ASSERT_TRUE(expected.equalsTo(result)); diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp index 38d88b469..f8bf47e53 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTests16.cpp @@ -196,4 +196,45 @@ TEST_F(DeclarableOpsTests16, test_range_2) { ASSERT_TRUE(shape::shapeEquals(z.shapeInfo(), shapes->at(0))); delete shapes; +} + +TEST_F(DeclarableOpsTests16, test_reverse_1) { + std::vector rows = {3, 5, 7, 8, 9, 10, 119, 211}; + std::vector columns = {6, 5, 10, 100, 153, 171, 635}; + + for (auto r : rows) { + for (auto c : columns) { + //nd4j_printf("Trying [%i, %i]\n", r, c); + auto array = NDArrayFactory::create('c', {r, c}); + auto exp = NDArrayFactory::create('c', {r, c}); + auto reversed = NDArrayFactory::create('c', {r, c}); + + auto rowOriginal = NDArrayFactory::create('c', {c}); + auto rowReversed = NDArrayFactory::create('c', {c}); + + for (int e = 0; e < c; e++) { + rowOriginal.p(e, (float) e); + rowReversed.p(c - e - 1, (float) e); + } + + + auto listI = array.allTensorsAlongDimension({1}); + auto listE = exp.allTensorsAlongDimension({1}); + + for (int e = 0; e < r; e++) { + listI->at(e)->assign(rowOriginal); + listE->at(e)->assign(rowReversed); + } + + delete listI; + delete listE; + + nd4j::ops::reverse op; + Nd4jLong axis = 1; + auto status = op.execute({&array}, {&reversed}, {}, {axis}, {}); + ASSERT_EQ(Status::OK(), status); + + ASSERT_EQ(exp, reversed); + } + } } \ No newline at end of file diff --git a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTestsCuda1.cu b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTestsCuda1.cu index 161b96918..f88cddde5 100644 --- a/libnd4j/tests_cpu/layers_tests/DeclarableOpsTestsCuda1.cu +++ b/libnd4j/tests_cpu/layers_tests/DeclarableOpsTestsCuda1.cu @@ -24,6 +24,7 @@ #include #include #include +#include using namespace nd4j; @@ -58,5 +59,20 @@ TEST_F(DeclarableOpsTestsCuda1, Test_CHOOSE_SCALAR_LARGE) { //ASSERT_TRUE(exp.isSameShape(z)); delete result; +} -} \ No newline at end of file +/* +TEST_F(DeclarableOpsTestsCuda1, Test_Reverse_TAD_1) { + auto x = NDArrayFactory::create('c', {1, 3, 608, 608}); + auto z = x.like(); + x.linspace(1.0f); + + nd4j::ops::reverse op; + auto timeStart = std::chrono::system_clock::now(); + auto status = op.execute({&x}, {&z}, {}, {1}, {}); + auto timeEnd = std::chrono::system_clock::now(); + auto outerTime = std::chrono::duration_cast (timeEnd - timeStart).count(); + nd4j_printf("exec time: %lld us\n", outerTime); + ASSERT_EQ(Status::OK(), status); +} +*/ \ No newline at end of file